/*************************************************************************/ /*!
@File           cache_km.c
@Title          CPU d-cache maintenance operations framework
@Copyright      Copyright (c) Imagination Technologies Ltd. All Rights Reserved
@Description    Implements server side code for CPU d-cache maintenance taking
                into account the idiosyncrasies of the various types of CPU
                d-cache instruction-set architecture (ISA) maintenance
                mechanisms.
@License        Dual MIT/GPLv2

The contents of this file are subject to the MIT license as set out below.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

Alternatively, the contents of this file may be used under the terms of
the GNU General Public License Version 2 ("GPL") in which case the provisions
of GPL are applicable instead of those above.

If you wish to allow use of your version of this file only under the terms of
GPL, and not to allow others to use your version of this file under the terms
of the MIT license, indicate your decision by deleting the provisions above
and replace them with the notice and other provisions required by GPL as set
out in the file called "GPL-COPYING" included in this distribution. If you do
not delete the provisions above, a recipient may use your version of this file
under the terms of either the MIT license or GPL.

This License is also included in this distribution in the file called
"MIT-COPYING".

EXCEPT AS OTHERWISE STATED IN A NEGOTIATED AGREEMENT: (A) THE SOFTWARE IS
PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE AND NONINFRINGEMENT; AND (B) IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ /**************************************************************************/
#if defined(LINUX)
#include <linux/version.h>
#include <linux/uaccess.h>
#include <asm/current.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#endif

#include "pmr.h"
#include "log2.h"
#include "device.h"
#include "pvrsrv.h"
#include "osfunc.h"
#include "cache_km.h"
#include "pvr_debug.h"
#include "lock_types.h"
#include "allocmem.h"
#include "process_stats.h"
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
#include "ri_server.h"
#endif
#include "devicemem.h"
#include "pvrsrv_apphint.h"
#include "pvrsrv_sync_server.h"
#include "km_apphint_defs.h"

/* This header must always be included last */
#if defined(LINUX)
#include "kernel_compatibility.h"
#endif

/* Top-level file-local build definitions */
#if defined(PVRSRV_ENABLE_CACHEOP_STATS) && defined(LINUX)
#define CACHEOP_DEBUG
/* Depth of the CacheOp statistics history ring (asStatsExecuted) */
#define CACHEOP_STATS_ITEMS_MAX 			32
/* Wrap-around increment/decrement for statistics ring indices */
#define INCR_WRAP(x)						((x+1) >= CACHEOP_STATS_ITEMS_MAX ? 0 : (x+1))
#define DECR_WRAP(x)						((x-1) < 0 ? (CACHEOP_STATS_ITEMS_MAX-1) : (x-1))
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
/* Refer to CacheOpStatsExecLogHeader() for header item names */
#define CACHEOP_RI_PRINTF_HEADER			"%-8s %-10s %-10s %-5s %-16s %-16s %-10s %-10s %-18s %-18s %-12s"
#define CACHEOP_RI_PRINTF					"%-8d %-10s %-10s %-5s 0x%-14llx 0x%-14llx 0x%-8llx 0x%-8llx %-18llu %-18llu 0x%-10x\n"
#else
/* Same stats-log formats but without the RI DevVAddr/DevPAddr columns */
#define CACHEOP_PRINTF_HEADER				"%-8s %-10s %-10s %-5s %-10s %-10s %-18s %-18s %-12s"
#define CACHEOP_PRINTF						"%-8d %-10s %-10s %-5s 0x%-8llx 0x%-8llx %-18llu %-18llu 0x%-10x\n"
#endif
#endif

//#define CACHEOP_NO_CACHE_LINE_ALIGNED_ROUNDING		/* Force OS page (not cache line) flush granularity */
#define CACHEOP_PVR_ASSERT(x)							/* Define as PVR_ASSERT(x), enable for swdev & testing */
#if defined(PVRSRV_SERVER_THREADS_INDEFINITE_SLEEP)
#define CACHEOP_THREAD_WAIT_TIMEOUT			0ULL		/* Wait indefinitely */
#else
#define CACHEOP_THREAD_WAIT_TIMEOUT			500000ULL	/* Wait 500ms between wait unless woken-up on demand */
#endif
#define CACHEOP_FENCE_WAIT_TIMEOUT			1000ULL		/* Wait 1ms between wait events unless woken-up */
#define CACHEOP_FENCE_RETRY_ABORT			1000ULL		/* Fence retries that aborts fence operation */
#define CACHEOP_SEQ_MIDPOINT (IMG_UINT32)	0x7FFFFFFF	/* Midpoint at which seqNum(s) are rebased/compared */
#define CACHEOP_ABORT_FENCE_ERROR_STRING	"detected stalled client, retrying cacheop fence"
#define CACHEOP_NO_GFLUSH_ERROR_STRING		"global flush requested on CPU without support"
#define CACHEOP_DEVMEM_OOR_ERROR_STRING		"cacheop device memory request is out of range"
#define CACHEOP_MAX_DEBUG_MESSAGE_LEN		160

/* A single CacheOp request as held in the deferred work-item queue */
typedef struct _CACHEOP_WORK_ITEM_
{
	PMR *psPMR;							/* Memory resource to maintain (NULL for fence/timeline ops) */
	IMG_UINT32 ui32GFSeqNum;			/* Global-flush seqNum sampled when the op was queued */
	IMG_UINT32 ui32OpSeqNum;			/* Unique sequence number assigned to this CacheOp */
	IMG_DEVMEM_SIZE_T uiSize;			/* Byte length of the region to maintain */
	PVRSRV_CACHE_OP uiCacheOp;			/* Requested operation (clean/invalidate/flush/...) */
	IMG_DEVMEM_OFFSET_T uiOffset;		/* Byte offset into the PMR */
	PVRSRV_TIMELINE iTimeline;			/* SW timeline to advance on completion, if any */
	SYNC_TIMELINE_OBJ sSWTimelineObj;
	PVRSRV_DEVICE_NODE *psDevNode;
#if defined(CACHEOP_DEBUG)
	/* Timestamps (nano-seconds) for statistics; see CacheOpStatsExecLogWrite() */
	IMG_UINT64 ui64EnqueuedTime;
	IMG_UINT64 ui64DequeuedTime;
	IMG_UINT64 ui64ExecuteTime;
	IMG_BOOL bDeferred;					/* Executed on the deferred worker thread */
	IMG_BOOL bKMReq;					/* Originated from a KM caller (not user-mode) */
	IMG_BOOL bRBF;						/* Serviced by range-based flush */
	IMG_BOOL bUMF;						/* Serviced by user-mode flush */
	IMG_PID pid;						/* Requesting process */
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
	RGXFWIF_DM eFenceOpType;			/* Data-master the fence operation was issued for */
#endif
#endif
} CACHEOP_WORK_ITEM;

/* Snapshot of one executed CacheOp, stored in the statistics history
   ring (gsCwq.asStatsExecuted) for debugfs/stats read-out */
typedef struct _CACHEOP_STATS_EXEC_ITEM_
{
	IMG_PID pid;						/* Requesting process */
	IMG_UINT32 ui32OpSeqNum;			/* CacheOp sequence number */
	PVRSRV_CACHE_OP uiCacheOp;			/* Operation type executed */
	IMG_DEVMEM_SIZE_T uiOffset;			/* Offset of maintained region */
	IMG_DEVMEM_SIZE_T uiSize;			/* Size of maintained region */
	/* Timestamps (nano-seconds); converted to micro-seconds on read-out */
	IMG_UINT64 ui64EnqueuedTime;
	IMG_UINT64 ui64DequeuedTime;
	IMG_UINT64 ui64ExecuteTime;
	IMG_BOOL bIsFence;					/* Entry records a fence, not a flush */
	IMG_BOOL bKMReq;					/* Originated from KM caller */
	IMG_BOOL bRBF;						/* Range-based flush used */
	IMG_BOOL bUMF;						/* User-mode flush used */
	IMG_BOOL bDeferred;					/* Executed via deferred worker thread */
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
	IMG_DEV_VIRTADDR sDevVAddr;			/* Device virtual address (from RI manager) */
	IMG_DEV_PHYADDR sDevPAddr;			/* Device physical address (from PMR) */
	RGXFWIF_DM eFenceOpType;			/* Data-master for fence entries */
#endif
} CACHEOP_STATS_EXEC_ITEM;

/* Bit-flags selecting which CacheOp mechanisms are enabled at runtime;
   combined in gsCwq.eConfig and mirrored in gsCwq.ui32Config */
typedef enum _CACHEOP_CONFIG_
{
	CACHEOP_CONFIG_DEFAULT = 0,
	/* cache flush mechanism types */
	CACHEOP_CONFIG_KRBF    = 1,		/* Kernel range-based flush */
	CACHEOP_CONFIG_KGF     = 2,		/* Kernel global flush (if CPU ISA supports it) */
	CACHEOP_CONFIG_URBF    = 4,		/* User-mode range-based flush */
	/* sw-emulated deferred flush mechanism */
	CACHEOP_CONFIG_KDF     = 8,		/* Kernel deferred flush (worker thread) */
	/* pseudo configuration items */
	CACHEOP_CONFIG_LAST    = 16,	/* First bit past the real mechanism flags */
	CACHEOP_CONFIG_KLOG    = 16,	/* Include KM requests in stats log; deliberately aliases LAST */
	CACHEOP_CONFIG_ALL     = 31		/* Mask of every valid configuration bit */
} CACHEOP_CONFIG;

/* Aggregate state of the CacheOp framework: deferred work queue, sequence
   counters, info page, configuration and (optionally) statistics */
typedef struct _CACHEOP_WORK_QUEUE_
{
/*
 * Init. state & primary device node framework
 * is anchored on.
 */
	IMG_BOOL bInit;
/*
  MMU page size/shift & d-cache line size
 */
	size_t uiPageSize;
	IMG_UINT32 uiLineSize;
	IMG_UINT32 uiLineShift;
	IMG_UINT32 uiPageShift;
	PVRSRV_CACHE_OP_ADDR_TYPE uiCacheOpAddrType;
/*
  CacheOp deferred queueing protocol
  + Implementation geared for performance, atomic counter based
	- Value Space is 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7 -> 8 -> n.
	- Index Space is 0 -> 1 -> 2 -> 3 -> 0 -> 1 -> 2 -> 3 -> 0 -> m.
		- Index = Value modulo CACHEOP_INDICES_LOG2_SIZE.
  + Write counter never collides with read counter in index space
	- Unless at start of day when both are initialised to zero.
	- This means we sacrifice one entry when the queue is full.
	- Incremented by producer
		- Value space tracks total number of CacheOps queued.
		- Index space identifies CacheOp CCB queue index.
  + Read counter increments towards write counter in value space
	- Empty queue occurs when read equals write counter.
	- Wrap-round logic handled by consumer as/when needed.
	- Incremented by consumer
		- Value space tracks total # of CacheOps executed.
		- Index space identifies CacheOp CCB queue index.
  + Total queued size adjusted up/down during write/read activity
	- Counter might overflow but does not compromise framework.
 */
	ATOMIC_T hReadCounter;
	ATOMIC_T hWriteCounter;
/*
  CacheOp sequence numbers
  + hCommonSeqNum:
	- Common sequence, numbers every CacheOp operation in both UM/KM.
	- In KM
		- Every deferred CacheOp (on behalf of UM) gets a unique seqNum.
		- Last executed deferred CacheOp updates gsCwq.hCompletedSeqNum.
		- Every GF operation (if supported) also gets a unique seqNum.
		- Last executed GF operation updates CACHEOP_INFO_GFSEQNUM0.
		- Under debug, all CacheOp gets a unique seqNum for tracking.
		- This includes all UM/KM synchronous non-deferred CacheOp(s)
	- In UM
		- If the processor architecture supports GF maintenance (in KM)
		- All UM CacheOp samples CACHEOP_INFO_GFSEQNUM0 via info. page.
		- CacheOp(s) discarded if another GF occurs before execution.
		- CacheOp(s) discarding happens in both UM and KM space.
  + hCompletedSeqNum:
	- Tracks last executed KM/deferred RBF/Global<timeline> CacheOp(s)
  + hDeferredSize:
	- Running total of size of currently deferred CacheOp in queue.
 */
	ATOMIC_T hDeferredSize;
	ATOMIC_T hCommonSeqNum;
	ATOMIC_T hCompletedSeqNum;
/*
  CacheOp information page
  + psInfoPagePMR:
	- Single system-wide OS page that is multi-mapped in UM/KM.
	- Mapped into clients using read-only memory protection.
	- Mapped into server using read/write memory protection.
	- Contains information pertaining to cache framework.
  + pui32InfoPage:
	- Server linear address pointer to said information page.
	- Each info-page entry currently of sizeof(IMG_UINT32).
 */
	PMR *psInfoPagePMR;
	IMG_UINT32 *pui32InfoPage;
/*
  CacheOp deferred work-item queue
  + CACHEOP_INDICES_LOG2_SIZE
	- Sized using GF/RBF ratio
 */
#define CACHEOP_INDICES_LOG2_SIZE	(4)
#define CACHEOP_INDICES_MAX			(1 << CACHEOP_INDICES_LOG2_SIZE)
#define CACHEOP_INDICES_MASK		(CACHEOP_INDICES_MAX-1)
	CACHEOP_WORK_ITEM asWorkItems[CACHEOP_INDICES_MAX];
#if defined(CACHEOP_DEBUG)
/*
  CacheOp statistics
  + Counters below are protected by hStatsExecLock; see
	CacheOpStatsExecLogWrite()/CacheOpStatsExecLogRead().
 */
	void *pvStatsEntry;
	IMG_HANDLE hStatsExecLock;
	IMG_UINT32 ui32ServerASync;
	IMG_UINT32 ui32ServerSyncVA;
	IMG_UINT32 ui32ServerSync;
	IMG_UINT32 ui32ServerRBF;
	IMG_UINT32 ui32ServerGF;
	IMG_UINT32 ui32ServerDGF;
	IMG_UINT32 ui32ServerDTL;
	IMG_UINT32 ui32ClientSync;
	IMG_UINT32 ui32ClientRBF;
	IMG_UINT32 ui32KMDiscards;
	IMG_UINT32 ui32UMDiscards;
	IMG_UINT32 ui32TotalFenceOps;
	IMG_UINT32 ui32TotalExecOps;
	IMG_UINT32 ui32AvgExecTime;
	IMG_UINT32 ui32AvgFenceTime;
	IMG_INT32 i32StatsExecWriteIdx;		/* Next write slot in the history ring */
	CACHEOP_STATS_EXEC_ITEM asStatsExecuted[CACHEOP_STATS_ITEMS_MAX];
#endif
/*
  CacheOp (re)configuration
 */
	void *pvConfigTune;
	IMG_HANDLE hConfigLock;				/* Serialises CacheOpConfigUpdate() */
/*
  CacheOp deferred worker thread
  + eConfig
	- Runtime configuration
  + hWorkerThread
	- CacheOp thread handler
  + hThreadWakeUpEvtObj
	- Event object to drive CacheOp worker thread sleep/wake-ups.
  + hClientWakeUpEvtObj
	- Event object to unblock stalled clients waiting on queue.
 */
	CACHEOP_CONFIG	eConfig;
	IMG_UINT32		ui32Config;			/* Integer mirror of eConfig for app-hint query */
	IMG_HANDLE		hWorkerThread;
	IMG_HANDLE 		hDeferredLock;
	IMG_HANDLE 		hGlobalFlushLock;
	IMG_HANDLE		hThreadWakeUpEvtObj;
	IMG_HANDLE		hClientWakeUpEvtObj;
	IMG_UINT32		ui32FenceWaitTimeUs;
	IMG_UINT32		ui32FenceRetryAbort;
	IMG_BOOL		bNoGlobalFlushImpl;	/* CPU ISA lacks a global-flush implementation */
	IMG_BOOL		bSupportsUMFlush;	/* CPU ISA supports user-mode range-based flush */
} CACHEOP_WORK_QUEUE;

/* Top-level CacheOp framework object; single system-wide instance */
static CACHEOP_WORK_QUEUE gsCwq;

/* Evaluates to IMG_TRUE when all configuration bit(s) 'e' are currently enabled */
#define CacheOpConfigSupports(e) ((gsCwq.eConfig & (e)) ? IMG_TRUE : IMG_FALSE)

/* Map the counter's current value into queue index space (modulo queue size) */
static INLINE IMG_UINT32 CacheOpIdxRead(ATOMIC_T *phCounter)
{
	return (IMG_UINT32)OSAtomicRead(phCounter) & CACHEOP_INDICES_MASK;
}

/* Atomically advance the counter and return the resulting queue index */
static INLINE IMG_UINT32 CacheOpIdxIncrement(ATOMIC_T *phCounter)
{
	return (IMG_UINT32)OSAtomicIncrement(phCounter) & CACHEOP_INDICES_MASK;
}

/* Peek the queue index the counter would map to after one increment,
   without modifying the counter */
static INLINE IMG_UINT32 CacheOpIdxNext(ATOMIC_T *phCounter)
{
	IMG_UINT32 ui32Value = OSAtomicRead(phCounter);
	return (ui32Value + 1) & CACHEOP_INDICES_MASK;
}

/* Distance between two counters in value space (e.g. pending queue entries) */
static INLINE IMG_UINT32 CacheOpIdxSpan(ATOMIC_T *phLhs, ATOMIC_T *phRhs)
{
	IMG_UINT32 ui32Lhs = OSAtomicRead(phLhs);
	IMG_UINT32 ui32Rhs = OSAtomicRead(phRhs);
	return ui32Lhs - ui32Rhs;
}

/*
 * DivBy10 - Divide an unsigned 64-bit value by 10 without a hardware
 * divide (64-bit division is unavailable/expensive in some kernel
 * environments). Used to convert nano-second timings to micro-seconds.
 *
 * Approximates uiNum * 0.8 with a shift/add series, shifts down by 3
 * to obtain uiNum/10, then corrects the quotient using the remainder.
 */
static INLINE IMG_UINT64 DivBy10(IMG_UINT64 uiNum)
{
	IMG_UINT64 uiQuot;
	IMG_UINT64 uiRem;

	uiQuot = (uiNum >> 1) + (uiNum >> 2);
	uiQuot = uiQuot + (uiQuot >> 4);
	uiQuot = uiQuot + (uiQuot >> 8);
	uiQuot = uiQuot + (uiQuot >> 16);
	/* Extend the series to full 64-bit precision: without this term the
	   approximation (the 32-bit Hacker's Delight divu10 form) loses
	   accuracy for operands >= 2^32, i.e. nano-second timings beyond
	   ~4.29 seconds, and returns wrong quotients */
	uiQuot = uiQuot + (uiQuot >> 32);
	uiQuot = uiQuot >> 3;
	uiRem  = uiNum - (((uiQuot << 2) + uiQuot) << 1);

	/* The estimate never overshoots and is low by at most one, so the
	   remainder lies in 0..19 and a single conditional +1 is exact */
	return uiQuot + (uiRem > 9);
}

/* Callback to dump info of cacheop thread in debug_dump: prints the
   current runtime configuration (thresholds from the info page, enabled
   flush mechanisms) and the number of deferred CacheOps still pending */
static void CacheOpThreadDumpInfo(DUMPDEBUG_PRINTF_FUNC* pfnDumpDebugPrintf,
				  				void *pvDumpDebugFile)
{
	PVR_DUMPDEBUG_LOG("    Configuration: QSZ: %d, UKT: %d, KDFT: %d, "
			  "KGFT: %d, LINESIZE: %d, PGSIZE: %d, KDF: %s, "
			  "URBF: %s, KGF: %s, KRBF: %s",
			  CACHEOP_INDICES_MAX,
			  gsCwq.pui32InfoPage[CACHEOP_INFO_UMKMTHRESHLD],
			  gsCwq.pui32InfoPage[CACHEOP_INFO_KMDFTHRESHLD],
			  gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD],
			  gsCwq.pui32InfoPage[CACHEOP_INFO_LINESIZE],
			  gsCwq.pui32InfoPage[CACHEOP_INFO_PGSIZE],
			  gsCwq.eConfig & CACHEOP_CONFIG_KDF  ? "Yes" : "No",
			  gsCwq.eConfig & CACHEOP_CONFIG_URBF ? "Yes" : "No",
			  gsCwq.eConfig & CACHEOP_CONFIG_KGF  ? "Yes" : "No",
			  gsCwq.eConfig & CACHEOP_CONFIG_KRBF ? "Yes" : "No"
			  );
	/* Pending count = producer (write) counter minus consumer (read) counter */
	PVR_DUMPDEBUG_LOG("    Pending deferred CacheOp entries : %u",
			  CacheOpIdxSpan(&gsCwq.hWriteCounter, &gsCwq.hReadCounter));
}

#if defined(CACHEOP_DEBUG)
/* Formats the column-header line for the statistics read-out into szBuffer;
   the column set must match CACHEOP_[RI_]PRINTF used for the data rows */
static INLINE void CacheOpStatsExecLogHeader(IMG_CHAR szBuffer[CACHEOP_MAX_DEBUG_MESSAGE_LEN])
{
	OSSNPrintf(szBuffer, CACHEOP_MAX_DEBUG_MESSAGE_LEN,
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
				CACHEOP_RI_PRINTF_HEADER,
#else
				CACHEOP_PRINTF_HEADER,
#endif
				"Pid",
				"CacheOp",
				"  Type",
				"Mode",
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
				/* Extra RI columns only present in this build flavour */
				"DevVAddr",
				"DevPAddr",
#endif
				"Offset",
				"Size",
				"xTime (us)",
				"qTime (us)",
				"SeqNum");
}

/*
 * Records a completed CacheOp work item into the circular statistics
 * buffer (gsCwq.asStatsExecuted), maintains the rolling average execution
 * time and forwards UM-originated operations to the per-PID process
 * statistics database. All ring/counter updates are serialised by
 * gsCwq.hStatsExecLock.
 */
static void CacheOpStatsExecLogWrite(CACHEOP_WORK_ITEM *psCacheOpWorkItem)
{
	IMG_UINT64 ui64ExecuteTime;
	IMG_UINT64 ui64EnqueuedTime;
	IMG_INT32 i32WriteOffset;

	if (!psCacheOpWorkItem->ui32OpSeqNum && !psCacheOpWorkItem->uiCacheOp)
	{
		/* This breaks the logic of read-out, so we do not queue items
		   with zero sequence number and no CacheOp */
		return;
	}
	else if (psCacheOpWorkItem->bKMReq && !CacheOpConfigSupports(CACHEOP_CONFIG_KLOG))
	{
		/* KM requests spam the history due to their frequency; drop them
		   entirely unless the KLOG configuration bit is enabled */
		return;
	}

	OSLockAcquire(gsCwq.hStatsExecLock);

	/* Claim the current ring slot and advance the write index (wraps) */
	i32WriteOffset = gsCwq.i32StatsExecWriteIdx;
	gsCwq.asStatsExecuted[i32WriteOffset].pid = psCacheOpWorkItem->pid;
	gsCwq.i32StatsExecWriteIdx = INCR_WRAP(gsCwq.i32StatsExecWriteIdx);
	gsCwq.asStatsExecuted[i32WriteOffset].bRBF = psCacheOpWorkItem->bRBF;
	gsCwq.asStatsExecuted[i32WriteOffset].bUMF = psCacheOpWorkItem->bUMF;
	gsCwq.asStatsExecuted[i32WriteOffset].uiSize = psCacheOpWorkItem->uiSize;
	gsCwq.asStatsExecuted[i32WriteOffset].bKMReq = psCacheOpWorkItem->bKMReq;
	gsCwq.asStatsExecuted[i32WriteOffset].uiOffset	= psCacheOpWorkItem->uiOffset;
	gsCwq.asStatsExecuted[i32WriteOffset].uiCacheOp = psCacheOpWorkItem->uiCacheOp;
	gsCwq.asStatsExecuted[i32WriteOffset].bDeferred = psCacheOpWorkItem->bDeferred;
	gsCwq.asStatsExecuted[i32WriteOffset].ui32OpSeqNum	= psCacheOpWorkItem->ui32OpSeqNum;
	gsCwq.asStatsExecuted[i32WriteOffset].ui64ExecuteTime = psCacheOpWorkItem->ui64ExecuteTime;
	gsCwq.asStatsExecuted[i32WriteOffset].ui64EnqueuedTime = psCacheOpWorkItem->ui64EnqueuedTime;
	gsCwq.asStatsExecuted[i32WriteOffset].ui64DequeuedTime = psCacheOpWorkItem->ui64DequeuedTime;
	/* During early system initialisation, only non-fence & non-PMR CacheOps are processed */
	gsCwq.asStatsExecuted[i32WriteOffset].bIsFence = gsCwq.bInit && !psCacheOpWorkItem->psPMR;
	CACHEOP_PVR_ASSERT(gsCwq.asStatsExecuted[i32WriteOffset].pid);
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
	if (gsCwq.bInit && psCacheOpWorkItem->psPMR)
	{
		IMG_CPU_PHYADDR sDevPAddr;
		PVRSRV_ERROR eError;
		IMG_BOOL bValid;

		/* Get more detailed information regarding the sub allocations that
		   PMR has from RI manager for process that requested the CacheOp */
		eError = RIDumpProcessListKM(psCacheOpWorkItem->psPMR,
									 gsCwq.asStatsExecuted[i32WriteOffset].pid,
									 gsCwq.asStatsExecuted[i32WriteOffset].uiOffset,
									 &gsCwq.asStatsExecuted[i32WriteOffset].sDevVAddr);
		if (eError != PVRSRV_OK)
		{
			goto e0;
		}

		/* (Re)lock here as some PMR might have not been locked */
		eError = PMRLockSysPhysAddresses(psCacheOpWorkItem->psPMR);
		if (eError != PVRSRV_OK)
		{
			goto e0;
		}

		eError = PMR_CpuPhysAddr(psCacheOpWorkItem->psPMR,
								 gsCwq.uiPageShift,
								 1,
								 gsCwq.asStatsExecuted[i32WriteOffset].uiOffset,
								 &sDevPAddr,
								 &bValid);
		if (eError != PVRSRV_OK)
		{
			eError = PMRUnlockSysPhysAddresses(psCacheOpWorkItem->psPMR);
			PVR_LOG_IF_ERROR(eError, "PMRUnlockSysPhysAddresses");
			goto e0;
		}

		eError = PMRUnlockSysPhysAddresses(psCacheOpWorkItem->psPMR);
		PVR_LOG_IF_ERROR(eError, "PMRUnlockSysPhysAddresses");

		gsCwq.asStatsExecuted[i32WriteOffset].sDevPAddr.uiAddr = sDevPAddr.uiAddr;
	}

	if (gsCwq.asStatsExecuted[i32WriteOffset].bIsFence)
	{
		gsCwq.asStatsExecuted[i32WriteOffset].eFenceOpType = psCacheOpWorkItem->eFenceOpType;
	}
#endif

	/* Convert timing from nano-seconds to micro-seconds */
	ui64ExecuteTime = gsCwq.asStatsExecuted[i32WriteOffset].ui64ExecuteTime;
	ui64EnqueuedTime = gsCwq.asStatsExecuted[i32WriteOffset].ui64EnqueuedTime;
	ui64ExecuteTime = DivBy10(DivBy10(DivBy10(ui64ExecuteTime)));
	ui64EnqueuedTime = DivBy10(DivBy10(DivBy10(ui64EnqueuedTime)));

	/* Coalesced (to global) deferred CacheOps do not contribute to statistics,
	   as both enqueue/execute time is identical for these CacheOps */
	if (!gsCwq.asStatsExecuted[i32WriteOffset].bIsFence)
	{
		/* Calculate the rolling approximate average execution time.
		   NOTE(review): ui32TotalExecOps is presumed to be maintained by
		   the caller/elsewhere in this file - confirm */
		IMG_UINT32 ui32Time = ui64EnqueuedTime < ui64ExecuteTime ?
									ui64ExecuteTime - ui64EnqueuedTime :
									ui64EnqueuedTime - ui64ExecuteTime;
		if (gsCwq.ui32TotalExecOps > 2 && ui32Time)
		{
			gsCwq.ui32AvgExecTime -= (gsCwq.ui32AvgExecTime / gsCwq.ui32TotalExecOps);
			gsCwq.ui32AvgExecTime += (ui32Time / gsCwq.ui32TotalExecOps);
		}
		else if (ui32Time)
		{
			gsCwq.ui32AvgExecTime = (IMG_UINT32)ui32Time;
		}
	}

	if (! gsCwq.asStatsExecuted[i32WriteOffset].bKMReq)
	{
		/* This operation queues only UM CacheOp in per-PID process statistics database */
		PVRSRVStatsUpdateCacheOpStats(gsCwq.asStatsExecuted[i32WriteOffset].uiCacheOp,
						gsCwq.asStatsExecuted[i32WriteOffset].ui32OpSeqNum,
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
						gsCwq.asStatsExecuted[i32WriteOffset].sDevVAddr,
						gsCwq.asStatsExecuted[i32WriteOffset].sDevPAddr,
						gsCwq.asStatsExecuted[i32WriteOffset].eFenceOpType,
#endif
						gsCwq.asStatsExecuted[i32WriteOffset].uiOffset,
						gsCwq.asStatsExecuted[i32WriteOffset].uiSize,
						ui64EnqueuedTime < ui64ExecuteTime ?
							ui64ExecuteTime - ui64EnqueuedTime:
							ui64EnqueuedTime - ui64ExecuteTime,
						gsCwq.asStatsExecuted[i32WriteOffset].bRBF,
						gsCwq.asStatsExecuted[i32WriteOffset].bUMF,
						gsCwq.asStatsExecuted[i32WriteOffset].bIsFence,
						psCacheOpWorkItem->pid);
	}

#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
e0:
#endif
	OSLockRelease(gsCwq.hStatsExecLock);
}

/*
 * Statistics read-out handler: prints the cache architecture/configuration
 * summary lines followed by the execution history ring, newest entry first,
 * stopping at the first never-written (all-zero) slot. Fence entries and
 * flush entries use different Type/Mode column decodes. All reads of the
 * ring and counters are serialised by gsCwq.hStatsExecLock.
 */
static void CacheOpStatsExecLogRead(void *pvFilePtr, void *pvData,
								OS_STATS_PRINTF_FUNC* pfnOSStatsPrintf)
{
	IMG_CHAR *pszFlushType;
	IMG_CHAR *pszCacheOpType;
	IMG_CHAR *pszFlushSource;
	IMG_INT32 i32ReadOffset;
	IMG_INT32 i32WriteOffset;
	IMG_UINT64 ui64EnqueuedTime;
	IMG_UINT64 ui64DequeuedTime;
	IMG_UINT64 ui64ExecuteTime;
	IMG_CHAR szBuffer[CACHEOP_MAX_DEBUG_MESSAGE_LEN] = {0};
	PVR_UNREFERENCED_PARAMETER(pvData);

	OSLockAcquire(gsCwq.hStatsExecLock);

	/* Line size is printed in hex: was "0x%d" which emitted a decimal
	   value behind a hex prefix */
	pfnOSStatsPrintf(pvFilePtr,
			"Primary CPU d-cache architecture: LSZ: 0x%x, URBF: %s, KGF: %s, KRBF: %s\n",
			gsCwq.uiLineSize,
			gsCwq.bSupportsUMFlush ? "Yes" : "No",
			!gsCwq.bNoGlobalFlushImpl ? "Yes" : "No",
			"Yes" /* KRBF mechanism always available */
		);

	pfnOSStatsPrintf(pvFilePtr,
			"Configuration: QSZ: %d, UKT: %d, KDFT: %d, KGFT: %d, KDF: %s, URBF: %s, KGF: %s, KRBF: %s\n",
			CACHEOP_INDICES_MAX,
			gsCwq.pui32InfoPage[CACHEOP_INFO_UMKMTHRESHLD],
			gsCwq.pui32InfoPage[CACHEOP_INFO_KMDFTHRESHLD],
			gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD],
			gsCwq.eConfig & CACHEOP_CONFIG_KDF  ? "Yes" : "No",
			gsCwq.eConfig & CACHEOP_CONFIG_URBF ? "Yes" : "No",
			gsCwq.eConfig & CACHEOP_CONFIG_KGF  ? "Yes" : "No",
			gsCwq.eConfig & CACHEOP_CONFIG_KRBF ? "Yes" : "No"
		);

	pfnOSStatsPrintf(pvFilePtr,
			"Summary: OP[F][TL] (tot.avg): %d.%d/%d.%d/%d, [KM][UM][A]SYNC: %d.%d/%d/%d, RBF (um/km): %d/%d, [D]GF (km): %d/%d, DSC (um/km): %d/%d\n",
			gsCwq.ui32TotalExecOps, gsCwq.ui32AvgExecTime, gsCwq.ui32TotalFenceOps, gsCwq.ui32AvgFenceTime, gsCwq.ui32ServerDTL,
			gsCwq.ui32ServerSync, gsCwq.ui32ServerSyncVA, gsCwq.ui32ClientSync,	gsCwq.ui32ServerASync,
			gsCwq.ui32ClientRBF,   gsCwq.ui32ServerRBF,
			gsCwq.ui32ServerDGF,   gsCwq.ui32ServerGF,
			gsCwq.ui32UMDiscards,  gsCwq.ui32KMDiscards
		);

	CacheOpStatsExecLogHeader(szBuffer);
	pfnOSStatsPrintf(pvFilePtr, "%s\n", szBuffer);

	/* Walk the ring backwards from the most recent entry */
	i32WriteOffset = gsCwq.i32StatsExecWriteIdx;
	for (i32ReadOffset = DECR_WRAP(i32WriteOffset);
		 i32ReadOffset != i32WriteOffset;
		 i32ReadOffset = DECR_WRAP(i32ReadOffset))
	{
		if (!gsCwq.asStatsExecuted[i32ReadOffset].ui32OpSeqNum &&
			!gsCwq.asStatsExecuted[i32ReadOffset].uiCacheOp)
		{
			/* Never-written slot: end of history */
			break;
		}

		/* Convert from nano-seconds to micro-seconds */
		ui64ExecuteTime = gsCwq.asStatsExecuted[i32ReadOffset].ui64ExecuteTime;
		ui64EnqueuedTime = gsCwq.asStatsExecuted[i32ReadOffset].ui64EnqueuedTime;
		ui64DequeuedTime = gsCwq.asStatsExecuted[i32ReadOffset].ui64DequeuedTime;
		ui64ExecuteTime = DivBy10(DivBy10(DivBy10(ui64ExecuteTime)));
		ui64EnqueuedTime = DivBy10(DivBy10(DivBy10(ui64EnqueuedTime)));
		ui64DequeuedTime = ui64DequeuedTime ? DivBy10(DivBy10(DivBy10(ui64DequeuedTime))) : 0;

		if (gsCwq.asStatsExecuted[i32ReadOffset].bIsFence)
		{
			IMG_CHAR *pszMode = "";
			IMG_CHAR *pszFenceType = "";
			pszCacheOpType = "Fence";

#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
			pszMode = gsCwq.asStatsExecuted[i32ReadOffset].uiCacheOp != PVRSRV_CACHE_OP_GLOBAL ? "" : "  GF  ";
			switch (gsCwq.asStatsExecuted[i32ReadOffset].eFenceOpType)
			{
				case RGXFWIF_DM_GP:
					pszFenceType = " GP/GF";
					break;

				case RGXFWIF_DM_TDM:
					pszFenceType = "  TDM ";
					break;

				case RGXFWIF_DM_TA:
					pszFenceType = "  TA ";
					break;

				case RGXFWIF_DM_3D:
					pszFenceType = "  PDM ";
					break;

				case RGXFWIF_DM_CDM:
					pszFenceType = "  CDM ";
					break;

				default:
					CACHEOP_PVR_ASSERT(0);
					break;
			}
#else
			/* The CacheOp fence operation also triggered a global cache flush operation */
			pszFenceType =
				gsCwq.asStatsExecuted[i32ReadOffset].uiCacheOp != PVRSRV_CACHE_OP_GLOBAL ? "" : "   GF ";
#endif
			pfnOSStatsPrintf(pvFilePtr,
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
							CACHEOP_RI_PRINTF,
#else
							CACHEOP_PRINTF,
#endif
							gsCwq.asStatsExecuted[i32ReadOffset].pid,
							pszCacheOpType,
							pszFenceType,
							pszMode,
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
							"",
							"",
#endif
							gsCwq.asStatsExecuted[i32ReadOffset].uiOffset,
							gsCwq.asStatsExecuted[i32ReadOffset].uiSize,
							ui64EnqueuedTime < ui64ExecuteTime ?
									ui64ExecuteTime - ui64EnqueuedTime
										:
									ui64EnqueuedTime - ui64ExecuteTime,
							ui64EnqueuedTime < ui64DequeuedTime ?
									ui64DequeuedTime - ui64EnqueuedTime
										:
									!ui64DequeuedTime ? 0 : ui64EnqueuedTime - ui64DequeuedTime,
							gsCwq.asStatsExecuted[i32ReadOffset].ui32OpSeqNum);
		}
		else
		{
			if (gsCwq.asStatsExecuted[i32ReadOffset].bRBF)
			{
				IMG_DEVMEM_SIZE_T ui64NumOfPages;

				ui64NumOfPages = gsCwq.asStatsExecuted[i32ReadOffset].uiSize >> gsCwq.uiPageShift;
				if (ui64NumOfPages <= PMR_MAX_TRANSLATION_STACK_ALLOC)
				{
					pszFlushType = "RBF.Fast";
				}
				else
				{
					pszFlushType = "RBF.Slow";
				}
			}
			else
			{
				pszFlushType = "   GF ";
			}

			if (gsCwq.asStatsExecuted[i32ReadOffset].bUMF)
			{
				pszFlushSource = " UM";
			}
			else
			{
				/*
				   - Request originates directly from a KM thread or in KM (KM<), or
				   - Request originates from a UM thread and is KM deferred (KM+), or
				   - Request is/was discarded due to an 'else-[when,where]' GFlush
				     - i.e. GF occurs either (a)sync to current UM/KM thread
				*/
				pszFlushSource =
					gsCwq.asStatsExecuted[i32ReadOffset].bKMReq ? " KM<" :
					gsCwq.asStatsExecuted[i32ReadOffset].bDeferred && gsCwq.asStatsExecuted[i32ReadOffset].ui64ExecuteTime ? " KM+" :
					!gsCwq.asStatsExecuted[i32ReadOffset].ui64ExecuteTime ? " KM-" : " KM";
			}

			switch (gsCwq.asStatsExecuted[i32ReadOffset].uiCacheOp)
			{
				case PVRSRV_CACHE_OP_NONE:
					pszCacheOpType = "None";
					break;
				case PVRSRV_CACHE_OP_CLEAN:
					pszCacheOpType = "Clean";
					break;
				case PVRSRV_CACHE_OP_INVALIDATE:
					pszCacheOpType = "Invalidate";
					break;
				case PVRSRV_CACHE_OP_FLUSH:
					pszCacheOpType = "Flush";
					break;
				case PVRSRV_CACHE_OP_GLOBAL:
					pszCacheOpType = "GFlush";
					break;
				case PVRSRV_CACHE_OP_TIMELINE:
					pszCacheOpType = "Timeline";
					pszFlushType = "      ";
					break;
				default:
					if ((IMG_UINT32)gsCwq.asStatsExecuted[i32ReadOffset].uiCacheOp == (IMG_UINT32)(PVRSRV_CACHE_OP_GLOBAL|PVRSRV_CACHE_OP_TIMELINE))
					{
						pszCacheOpType = "Timeline";
					}
					else
					{
						pszCacheOpType = "Unknown";
						gsCwq.asStatsExecuted[i32ReadOffset].ui32OpSeqNum =
								(IMG_UINT32) gsCwq.asStatsExecuted[i32ReadOffset].uiCacheOp;
					}
					break;
			}

			pfnOSStatsPrintf(pvFilePtr,
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
							CACHEOP_RI_PRINTF,
#else
							CACHEOP_PRINTF,
#endif
							gsCwq.asStatsExecuted[i32ReadOffset].pid,
							pszCacheOpType,
							pszFlushType,
							pszFlushSource,
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
							gsCwq.asStatsExecuted[i32ReadOffset].sDevVAddr.uiAddr,
							gsCwq.asStatsExecuted[i32ReadOffset].sDevPAddr.uiAddr,
#endif
							gsCwq.asStatsExecuted[i32ReadOffset].uiOffset,
							gsCwq.asStatsExecuted[i32ReadOffset].uiSize,
							ui64EnqueuedTime < ui64ExecuteTime ?
										ui64ExecuteTime - ui64EnqueuedTime
											:
										ui64EnqueuedTime - ui64ExecuteTime,
							ui64EnqueuedTime < ui64DequeuedTime ?
									ui64DequeuedTime - ui64EnqueuedTime
										:
									!ui64DequeuedTime ? 0 : ui64EnqueuedTime - ui64DequeuedTime,
							gsCwq.asStatsExecuted[i32ReadOffset].ui32OpSeqNum);
		}
	}

	OSLockRelease(gsCwq.hStatsExecLock);
}
#endif /* defined(CACHEOP_DEBUG) */

/* Clears every CacheOp statistics counter and the execution history ring;
   compiles to a no-op when statistics are disabled */
static INLINE void CacheOpStatsReset(void)
{
#if defined(CACHEOP_DEBUG)
	/* Totals and rolling averages */
	gsCwq.ui32TotalExecOps  = 0;
	gsCwq.ui32TotalFenceOps = 0;
	gsCwq.ui32AvgExecTime   = 0;
	gsCwq.ui32AvgFenceTime  = 0;
	/* Server-side counters */
	gsCwq.ui32ServerRBF     = 0;
	gsCwq.ui32ServerGF      = 0;
	gsCwq.ui32ServerDGF     = 0;
	gsCwq.ui32ServerDTL     = 0;
	gsCwq.ui32ServerSync    = 0;
	gsCwq.ui32ServerSyncVA  = 0;
	gsCwq.ui32ServerASync   = 0;
	/* Client-side counters */
	gsCwq.ui32ClientRBF     = 0;
	gsCwq.ui32ClientSync    = 0;
	/* Discard counters */
	gsCwq.ui32KMDiscards    = 0;
	gsCwq.ui32UMDiscards    = 0;
	/* History ring and its write index */
	gsCwq.i32StatsExecWriteIdx = 0;
	OSCachedMemSet(gsCwq.asStatsExecuted, 0, sizeof(gsCwq.asStatsExecuted));
#endif
}

/*
 * Applies a new runtime configuration: recomputes gsCwq.eConfig from the
 * requested bits (constrained by platform capabilities), mirrors it into
 * gsCwq.ui32Config, refreshes the info-page thresholds derived from it
 * and resets the statistics. Serialised by gsCwq.hConfigLock.
 */
static void CacheOpConfigUpdate(IMG_UINT32 ui32Config)
{
	OSLockAcquire(gsCwq.hConfigLock);

	/* Step 0, derive the gsCwq.eConfig mechanism bits */
	if (!(ui32Config & (CACHEOP_CONFIG_LAST - 1)))
	{
		/* No mechanism requested: fall back to the platform defaults */
		IMG_UINT32 uiDefault = CACHEOP_CONFIG_KRBF | CACHEOP_CONFIG_KDF;

		if (! gsCwq.bNoGlobalFlushImpl)
		{
			uiDefault |= CACHEOP_CONFIG_KGF;
		}
		if (gsCwq.bSupportsUMFlush)
		{
			uiDefault |= CACHEOP_CONFIG_URBF;
		}

		gsCwq.eConfig = (CACHEOP_CONFIG)uiDefault;
	}
	else
	{
		/* Rebuild the four mechanism bits from the request, honouring
		   platform capabilities; preserve any non-mechanism bits */
		IMG_UINT32 uiMechanisms = gsCwq.eConfig & ~(CACHEOP_CONFIG_KRBF | CACHEOP_CONFIG_KDF |
													CACHEOP_CONFIG_KGF  | CACHEOP_CONFIG_URBF);

		if (ui32Config & CACHEOP_CONFIG_KRBF)
		{
			uiMechanisms |= CACHEOP_CONFIG_KRBF;
		}
		if (ui32Config & CACHEOP_CONFIG_KDF)
		{
			uiMechanisms |= CACHEOP_CONFIG_KDF;
		}
		if (!gsCwq.bNoGlobalFlushImpl && (ui32Config & CACHEOP_CONFIG_KGF))
		{
			uiMechanisms |= CACHEOP_CONFIG_KGF;
		}
		if (gsCwq.bSupportsUMFlush && (ui32Config & CACHEOP_CONFIG_URBF))
		{
			uiMechanisms |= CACHEOP_CONFIG_URBF;
		}

		gsCwq.eConfig = (CACHEOP_CONFIG)uiMechanisms;
	}

	/* KLOG: when set, KM-originated CacheOps are included in the stats
	   history (see CacheOpStatsExecLogWrite) */
	if (ui32Config & CACHEOP_CONFIG_KLOG)
	{
		gsCwq.eConfig = (CACHEOP_CONFIG)(gsCwq.eConfig | CACHEOP_CONFIG_KLOG);
	}
	else
	{
		gsCwq.eConfig = (CACHEOP_CONFIG)(gsCwq.eConfig & ~CACHEOP_CONFIG_KLOG);
	}

	/* Step 1, mirror eConfig into the integer ui32Config used for app-hint
	   query; eConfig only ever carries bits within CACHEOP_CONFIG_ALL */
	gsCwq.ui32Config = (IMG_UINT32)gsCwq.eConfig & CACHEOP_CONFIG_ALL;

	/* Step 2, Bar RBF promotion to GF, unless a GF is implemented */
	gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD] = (IMG_UINT32)~0;
	if (! gsCwq.bNoGlobalFlushImpl)
	{
		gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD] = (IMG_UINT32)PVR_DIRTY_BYTES_FLUSH_THRESHOLD;
	}

	/* Step 3, in certain cases where a CacheOp/VA is provided, this threshold determines at what point
	   the optimisation due to the presence of said VA (i.e. us not having to remap the PMR pages in KM)
	   is clawed-back because of the overhead of maintaining such large request which might stalls the
	   user thread; so to hide this latency have these CacheOps executed on deferred CacheOp thread */
	gsCwq.pui32InfoPage[CACHEOP_INFO_KMDFTHRESHLD] = (IMG_UINT32)(PVR_DIRTY_BYTES_FLUSH_THRESHOLD >> 2);

	/* Step 4, if no UM support, all requests are done in KM so zero these forcing all client requests
	   to come down into the KM for maintenance */
	gsCwq.pui32InfoPage[CACHEOP_INFO_UMKMTHRESHLD] = 0;
	gsCwq.pui32InfoPage[CACHEOP_INFO_UMRBFONLY] = 0;
	if (gsCwq.bSupportsUMFlush)
	{
		/* If URBF has been selected exclusively OR selected but there is no GF implementation */
		if ((gsCwq.eConfig & CACHEOP_CONFIG_URBF) &&
			(gsCwq.bNoGlobalFlushImpl || !((gsCwq.ui32Config & (CACHEOP_CONFIG_LAST-1)) & ~CACHEOP_CONFIG_URBF)))
		{
			/* If only URBF has been selected, simulate without GF support OR no GF support means all client
			   requests should be done in UM. In both cases, set this threshold to the highest value to
			   prevent any client requests coming down to the server for maintenance. */
			gsCwq.pui32InfoPage[CACHEOP_INFO_UMKMTHRESHLD] = (IMG_UINT32)~0;
			gsCwq.pui32InfoPage[CACHEOP_INFO_UMRBFONLY] = 1;
		}
		/* This is the default entry for setting the UM info. page entries */
		else if ((gsCwq.eConfig & CACHEOP_CONFIG_URBF) && !gsCwq.bNoGlobalFlushImpl)
		{
			/* Set UM/KM threshold, all request sizes above this goes to server for GF maintenance _only_
			   because UM flushes already have VA acquired, no cost is incurred in per-page (re)mapping
			   of the to-be maintained PMR/page(s) as it the case with KM flushing so disallow KDF */

			/* Assume an average UM flush performance, anything above should be promoted to GF.
			   For x86 UMA/LMA, we avoid KDF because remapping PMR/pages in KM might fail due to exhausted
			   or fragmented VMALLOC kernel VA space */
			gsCwq.pui32InfoPage[CACHEOP_INFO_UMKMTHRESHLD] = gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD];
		}
	}

	/* Step 5, reset stats. */
	CacheOpStatsReset();

	OSLockRelease(gsCwq.hConfigLock);
}

static INLINE void CacheOpConfigRead(void *pvFilePtr,
									 void *pvData,
									 OS_STATS_PRINTF_FUNC* pfnOSStatsPrintf)
{
	/* Render each active CacheOp configuration bit as a Yes/No string
	   before emitting the single summary line */
	const char *pszKDF  = (gsCwq.eConfig & CACHEOP_CONFIG_KDF)  ? "Yes" : "No";
	const char *pszURBF = (gsCwq.eConfig & CACHEOP_CONFIG_URBF) ? "Yes" : "No";
	const char *pszKGF  = (gsCwq.eConfig & CACHEOP_CONFIG_KGF)  ? "Yes" : "No";
	const char *pszKRBF = (gsCwq.eConfig & CACHEOP_CONFIG_KRBF) ? "Yes" : "No";

	PVR_UNREFERENCED_PARAMETER(pvData);

	pfnOSStatsPrintf(pvFilePtr,
			"KDF: %s, URBF: %s, KGF: %s, KRBF: %s\n",
			pszKDF, pszURBF, pszKGF, pszKRBF);
}

static INLINE PVRSRV_ERROR CacheOpConfigQuery(const PVRSRV_DEVICE_NODE *psDevNode,
											const void *psPrivate,
											IMG_UINT32 *pui32Value)
{
	/* The AppHint ID is smuggled through the opaque private pointer */
	IMG_UINT32 ui32ID = (IMG_UINT32)(uintptr_t) psPrivate;
	PVR_UNREFERENCED_PARAMETER(psDevNode);

	/* Report the current value of the selected CacheOp tunable */
	if (ui32ID == APPHINT_ID_CacheOpConfig)
	{
		*pui32Value = gsCwq.ui32Config;
	}
	else if (ui32ID == APPHINT_ID_CacheOpGFThresholdSize)
	{
		*pui32Value = gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD];
	}
	else if (ui32ID == APPHINT_ID_CacheOpUMKMThresholdSize)
	{
		*pui32Value = gsCwq.pui32InfoPage[CACHEOP_INFO_UMKMTHRESHLD];
	}
	/* Unknown IDs are silently ignored; *pui32Value is left untouched */

	return PVRSRV_OK;
}

/* AppHint setter for the CacheOp framework tunables; counterpart to
   CacheOpConfigQuery(). ui32ID (smuggled through psPrivate) selects the
   tunable. Requests may be silently ignored or rounded/aligned; unknown
   IDs fall through. Always returns PVRSRV_OK. */
static INLINE PVRSRV_ERROR CacheOpConfigSet(const PVRSRV_DEVICE_NODE *psDevNode,
											const void *psPrivate,
											IMG_UINT32 ui32Value)
{
	IMG_UINT32 ui32ID = (IMG_UINT32)(uintptr_t) psPrivate;
	PVR_UNREFERENCED_PARAMETER(psDevNode);

	switch (ui32ID)
	{
		case APPHINT_ID_CacheOpConfig:
			/* Strip any bits outside the recognised configuration set */
			CacheOpConfigUpdate(ui32Value & CACHEOP_CONFIG_ALL);
			break;

		case APPHINT_ID_CacheOpGFThresholdSize:
		{
			if (!ui32Value || gsCwq.bNoGlobalFlushImpl)
			{
				/* CPU ISA does not support GF, silently ignore request to adjust threshold */
				PVR_ASSERT(gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD] == (IMG_UINT32)~0);
				break;
			}
			else if (ui32Value < gsCwq.uiPageSize)
			{
				/* Silently round-up to OS page size */
				ui32Value = gsCwq.uiPageSize;
			}

			/* Align to OS page size */
			ui32Value &= ~(gsCwq.uiPageSize - 1);

			/* Adjust KM deferred threshold given this updated KM global threshold */
			if (ui32Value == gsCwq.uiPageSize || ui32Value < gsCwq.pui32InfoPage[CACHEOP_INFO_KMDFTHRESHLD])
			{
				/* NOTE(review): KMDFTHRESHLD is written as ui32Value >> 2
				   here but ui32Value >> 1 in the UMKM branch below, and the
				   comparison above mixes the shifted stored value with an
				   unshifted byte count -- the differing scale factors look
				   deliberate but are undocumented; confirm the intended
				   units for CACHEOP_INFO_KMDFTHRESHLD */
				gsCwq.pui32InfoPage[CACHEOP_INFO_KMDFTHRESHLD] = ui32Value >> 2;
			}

			gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD] = ui32Value;

			break;
		}

		case APPHINT_ID_CacheOpUMKMThresholdSize:
		{
			if (!ui32Value || !gsCwq.bSupportsUMFlush)
			{
				/* CPU ISA does not support UM flush, therefore every request goes down into
				   the KM, silently ignore request to adjust threshold */
				PVR_ASSERT(! gsCwq.pui32InfoPage[CACHEOP_INFO_UMKMTHRESHLD]);
				break;
			}
			else if (ui32Value < gsCwq.uiPageSize)
			{
				/* Silently round-up to OS page size */
				ui32Value = gsCwq.uiPageSize;
			}

			/* Align to OS page size */
			ui32Value &= ~(gsCwq.uiPageSize - 1);

			if (gsCwq.bNoGlobalFlushImpl || ui32Value < gsCwq.pui32InfoPage[CACHEOP_INFO_KMDFTHRESHLD])
			{
				/* CPU ISA does not support GF also, therefore it seems there is no benefit to
				   re-routing this to KM as request won't be promoted to GF but request can
				   benefit from KM async. execution so ensure KM deferred threshold applies */
				PVR_ASSERT(gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD] == (IMG_UINT32)~0);
				gsCwq.pui32InfoPage[CACHEOP_INFO_KMDFTHRESHLD] = ui32Value >> 1;
				break;
			}

			gsCwq.pui32InfoPage[CACHEOP_INFO_UMKMTHRESHLD] = ui32Value;

			break;
		}

		default:
			break;
	}

	return PVRSRV_OK;
}

static INLINE void CacheOpQItemRecycle(CACHEOP_WORK_ITEM *psCacheOpWorkItem)
{
	/* Drop the phys-addr lock taken when the work item was queued */
	PVRSRV_ERROR eUnlockError = PMRUnlockSysPhysAddresses(psCacheOpWorkItem->psPMR);
	PVR_LOG_IF_ERROR(eUnlockError, "PMRUnlockSysPhysAddresses");

	/* Poison the slot with maximum/sentinel values. Should re-use of this
	   index ever fail to re-initialise it, the poison prevents stale state
	   from discarding later deferred CacheOps or signalling the sw sync
	   timeline by accident */
	psCacheOpWorkItem->iTimeline = PVRSRV_NO_TIMELINE;
	psCacheOpWorkItem->ui32GFSeqNum = (IMG_UINT32)~0;
	psCacheOpWorkItem->ui32OpSeqNum = (IMG_UINT32)~0;
#if defined(CACHEOP_DEBUG)
	psCacheOpWorkItem->psPMR = (void *)(uintptr_t)~0;
#endif
}

/* Debug-only sanity check that a queue slot being consumed holds a fully
   populated work item, i.e. none of its fields still carry the poison
   values written by CacheOpQItemRecycle(). No-op in release builds. */
static INLINE void CacheOpQItemReadCheck(CACHEOP_WORK_ITEM *psCacheOpWorkItem)
{
#if defined(CACHEOP_DEBUG)
	/* PMR pointer must be neither NULL nor the recycle poison value */
	CACHEOP_PVR_ASSERT(psCacheOpWorkItem->psPMR);
	CACHEOP_PVR_ASSERT(psCacheOpWorkItem->psPMR != (void *)(uintptr_t)~0);
	CACHEOP_PVR_ASSERT(psCacheOpWorkItem->ui32OpSeqNum != (IMG_UINT32)~0);
	if (CacheOpConfigSupports(CACHEOP_CONFIG_KGF))
	{
		/* GF sequence number is only meaningful when KGF is configured */
		CACHEOP_PVR_ASSERT(psCacheOpWorkItem->ui32GFSeqNum != (IMG_UINT32)~0);
	}
#else
	PVR_UNREFERENCED_PARAMETER(psCacheOpWorkItem);
#endif
}

/* Debug-only sanity check that a queue slot about to be (re)written was
   properly recycled, i.e. every field still carries the poison/sentinel
   values set by CacheOpQItemRecycle(). No-op in release builds. */
static INLINE void CacheOpQItemWriteCheck(CACHEOP_WORK_ITEM *psCacheOpWorkItem)
{
#if defined(CACHEOP_DEBUG)
	CACHEOP_PVR_ASSERT(psCacheOpWorkItem->psPMR == (void *)(uintptr_t)~0);
	CACHEOP_PVR_ASSERT(psCacheOpWorkItem->ui32OpSeqNum == (IMG_UINT32)~0);
	CACHEOP_PVR_ASSERT(psCacheOpWorkItem->ui32GFSeqNum == (IMG_UINT32)~0);
	CACHEOP_PVR_ASSERT(psCacheOpWorkItem->iTimeline == PVRSRV_NO_TIMELINE);
#else
	PVR_UNREFERENCED_PARAMETER(psCacheOpWorkItem);
#endif
}

static INLINE IMG_UINT32 CacheOpGetNextCommonSeqNum(void)
{
	IMG_UINT32 ui32SeqNum = OSAtomicIncrement(&gsCwq.hCommonSeqNum);

	/* Zero is reserved: a fence value of zero means "nothing outstanding",
	   which keeps all subsequent fence checks trivial when no maintenance
	   operation is pending. If the atomic increment just wrapped onto
	   zero, skip past it. */
	if (ui32SeqNum == 0)
	{
		if (! CacheOpConfigSupports(CACHEOP_CONFIG_KGF))
		{
			ui32SeqNum = OSAtomicIncrement(&gsCwq.hCommonSeqNum);
		}
		else
		{
			/* With KGF active, a wrap across zero could leave GFSEQNUM
			   erroneously ahead of every client seqNum in the system, so
			   momentarily zero both published GF sequence numbers until
			   the next GF arrives. In-flight ">" discards then never fire
			   (nothing is greater than itself vs zero) and "<=" checks
			   always pass (zero is below all non-zero values). The extra
			   flush below papers over race windows for seqNums still
			   behind the wrap point. */
			gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] = 0;
			gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM1] = 0;
			ui32SeqNum = OSAtomicIncrement(&gsCwq.hCommonSeqNum);
			(void) OSCPUOperation(PVRSRV_CACHE_OP_FLUSH);
		}
	}

	return ui32SeqNum;
}

static INLINE IMG_BOOL CacheOpFenceCheck(IMG_UINT32 ui32CompletedSeqNum,
										 IMG_UINT32 ui32FenceSeqNum)
{
	IMG_UINT32 ui32Shift;
	IMG_UINT32 ui32ShiftedCompleted;
	IMG_UINT32 ui32ShiftedFence;

	/* A zero fence means no CacheOp is outstanding: trivially satisfied
	   (zero is never handed out as a sequence number) */
	if (! ui32FenceSeqNum)
	{
		return IMG_TRUE;
	}

	/* Compare two points on a wrapping 32-bit timeline using a single
	   unsigned counter: translate both values so the completed seqNum
	   sits at the sequence midpoint, then compare there. Two's complement
	   wrap-around makes the translation correct for both overflowed and
	   underflowed inputs.

	   Precondition: |completed - fence| < CACHEOP_SEQ_MIDPOINT, i.e. no
	   more than ~2 billion CacheOps occur between a client's request and
	   its fence check -- safely unlikely. Branchless equivalent of:

		   if (|x - y| < CACHEOP_SEQ_MIDPOINT)
			   return (x - y) >= 0
		   else
			   return (y - x) >= 0
	 */
	ui32Shift = CACHEOP_SEQ_MIDPOINT - ui32CompletedSeqNum;

	/* ui32Shift may conceptually be negative; unsigned modular arithmetic
	   still yields the right translated positions */
	ui32ShiftedCompleted = ui32Shift + ui32CompletedSeqNum;
	ui32ShiftedFence = ui32Shift + ui32FenceSeqNum;

	return (ui32ShiftedCompleted >= ui32ShiftedFence);
}

static INLINE PVRSRV_ERROR CacheOpTimelineBind(PVRSRV_DEVICE_NODE *psDevNode,
											   CACHEOP_WORK_ITEM *psCacheOpWorkItem,
											   PVRSRV_TIMELINE iTimeline)
{
	PVRSRV_ERROR eError;

	/* Start from a known-safe state so an early return leaves the work
	   item benign for downstream queue processing */
	SyncClearTimelineObj(&psCacheOpWorkItem->sSWTimelineObj);
	psCacheOpWorkItem->psDevNode = psDevNode;
	psCacheOpWorkItem->iTimeline = PVRSRV_NO_TIMELINE;

	/* No timeline supplied: nothing to bind */
	if (iTimeline == PVRSRV_NO_TIMELINE)
	{
		return PVRSRV_OK;
	}

#if defined(PVR_USE_FENCE_SYNC_MODEL)
	/* Record the timeline and take a reference on its SW timeline object */
	psCacheOpWorkItem->iTimeline = iTimeline;
	eError = SyncSWGetTimelineObj(iTimeline, &psCacheOpWorkItem->sSWTimelineObj);
	PVR_LOG_IF_ERROR(eError, "SyncSWGetTimelineObj");
#else
	eError = PVRSRV_ERROR_NOT_IMPLEMENTED;
#endif

	return eError;
}

static INLINE PVRSRV_ERROR CacheOpTimelineExec(CACHEOP_WORK_ITEM *psCacheOpWorkItem)
{
	PVRSRV_ERROR eError;

	/* Work items not bound to a timeline have nothing to signal */
	if (psCacheOpWorkItem->iTimeline == PVRSRV_NO_TIMELINE)
	{
		return PVRSRV_OK;
	}

	/* A bound timeline implies a valid underlying timeline object */
	CACHEOP_PVR_ASSERT(psCacheOpWorkItem->sSWTimelineObj.pvTlObj);

#if defined(PVR_USE_FENCE_SYNC_MODEL)
	/* Advance the SW timeline, then drop the reference taken at bind time */
	eError = SyncSWTimelineAdvanceKM(psCacheOpWorkItem->psDevNode,
	                                 &psCacheOpWorkItem->sSWTimelineObj);
	(void) SyncSWTimelineReleaseKM(&psCacheOpWorkItem->sSWTimelineObj);
#else
	eError = PVRSRV_ERROR_NOT_IMPLEMENTED;
#endif

	return eError;
}

/* Performs a CPU-wide (global) d-cache flush, if the CPU ISA provides one
   and the framework is configured to use it (KGF). Serialised on
   hGlobalFlushLock; publishes a lockless before/after sequence-number pair
   (GFSEQNUM1/GFSEQNUM0) so UM/KM readers can detect flushes that raced
   with their own sampling. Returns PVRSRV_OK on success (or on discard),
   PVRSRV_ERROR_NOT_IMPLEMENTED / NOT_SUPPORTED when GF is unavailable. */
static INLINE PVRSRV_ERROR CacheOpGlobalFlush(void)
{
#if !defined(CACHEFLUSH_ISA_SUPPORTS_GLOBAL_FLUSH)
	return PVRSRV_ERROR_NOT_IMPLEMENTED;
#else
	PVRSRV_ERROR eError;
	/* Sample the completed-GF seqNum before taking the lock */
	IMG_UINT32 ui32OpSeqNum = gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0];

	if (! CacheOpConfigSupports(CACHEOP_CONFIG_KGF))
	{
		return PVRSRV_ERROR_NOT_SUPPORTED;
	}

	OSLockAcquire(gsCwq.hGlobalFlushLock);
	/* If another GF completed between our sample and acquiring the lock,
	   that flush already covers this request: discard it */
	if (ui32OpSeqNum < gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0])
	{
#if defined(CACHEOP_DEBUG)
		gsCwq.ui32KMDiscards += 1;
#endif
		eError = PVRSRV_OK;
		goto exit;
	}

	/* User space sampling the information-page seqNumbers after this point
	   and before the corresponding GFSEQNUM0 update leads to an invalid
	   sampling which must be discarded by UM. This implements a lockless
	   critical region for a single KM(writer) & multiple UM/KM(readers) */
	ui32OpSeqNum = CacheOpGetNextCommonSeqNum();
	gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM1] = ui32OpSeqNum;

	eError = OSCPUOperation(PVRSRV_CACHE_OP_FLUSH);
	PVR_LOGG_IF_ERROR(eError, "OSCPUOperation(PVRSRV_CACHE_OP_FLUSH)", exit);

	/* Publish completion; any deferred range work queued before this point
	   is now covered by the global flush, so reset the deferred byte count */
	gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] = ui32OpSeqNum;
	OSAtomicWrite(&gsCwq.hDeferredSize, 0);
#if defined(CACHEOP_DEBUG)
	gsCwq.ui32ServerGF += 1;
#endif

exit:
	OSLockRelease(gsCwq.hGlobalFlushLock);
	return eError;
#endif
}

static INLINE void CacheOpExecRangeBased(PVRSRV_DEVICE_NODE *psDevNode,
										PVRSRV_CACHE_OP uiCacheOp,
										IMG_BYTE *pbCpuVirtAddr,
										IMG_CPU_PHYADDR sCpuPhyAddr,
										IMG_DEVMEM_OFFSET_T uiPgAlignedOffset,
										IMG_DEVMEM_OFFSET_T uiCLAlignedStartOffset,
										IMG_DEVMEM_OFFSET_T uiCLAlignedEndOffset)
{
	IMG_BYTE *pbVirtStart;
	IMG_BYTE *pbVirtEnd;
	IMG_CPU_PHYADDR sPhyStart;
	IMG_CPU_PHYADDR sPhyEnd;
	IMG_DEVMEM_SIZE_T uiFlushSize;
	IMG_DEVMEM_OFFSET_T uiFlushOffset;
	IMG_DEVMEM_SIZE_T uiNextPgAlignedOffset;

	/* Work at cache-line granularity within this page so no more
	   maintenance than strictly necessary is issued */
	CACHEOP_PVR_ASSERT(uiPgAlignedOffset < uiCLAlignedEndOffset);
	uiFlushSize = (IMG_DEVMEM_SIZE_T)gsCwq.uiPageSize;
	uiFlushOffset = 0;

	if (uiCLAlignedStartOffset > uiPgAlignedOffset)
	{
		/* Non-zero only when the range begins part-way into this page */
		uiFlushOffset = uiCLAlignedStartOffset - uiPgAlignedOffset;
		uiFlushSize -= uiFlushOffset;
	}

	/* Trim the flush size when this is the final page of the range: the
	   next page-aligned offset either wraps round (this page is the last
	   representable one, implying it exceeds the range end) or overshoots
	   uiCLAlignedEndOffset. Handles both the single-page range and the
	   short tail of a multi-page range */
	uiNextPgAlignedOffset = uiPgAlignedOffset + (IMG_DEVMEM_SIZE_T)gsCwq.uiPageSize;
	if (uiNextPgAlignedOffset < uiPgAlignedOffset ||
		uiNextPgAlignedOffset > uiCLAlignedEndOffset)
	{
		uiFlushSize = uiFlushOffset ?
				uiCLAlignedEndOffset - uiCLAlignedStartOffset :
				uiCLAlignedEndOffset - uiPgAlignedOffset;
	}

	/* One maintenance request over the full relative range is cheaper
	   than several per-cache-line-aligned requests */
	sPhyStart.uiAddr = sCpuPhyAddr.uiAddr + uiFlushOffset;
	sPhyEnd.uiAddr = sPhyStart.uiAddr + uiFlushSize;
	if (pbCpuVirtAddr)
	{
		pbVirtStart = pbCpuVirtAddr + uiFlushOffset;
		pbVirtEnd = pbVirtStart + uiFlushSize;
	}
	else
	{
		/* OS/Env layer support functions accept NULL virtual addresses */
		pbVirtStart = NULL;
		pbVirtEnd = NULL;
	}

	/* Issue the requested CPU d-cache operation for successive cache
	   lines up to the page or in-page cache-line boundary */
	switch (uiCacheOp)
	{
		case PVRSRV_CACHE_OP_CLEAN:
			OSCPUCacheCleanRangeKM(psDevNode, pbVirtStart, pbVirtEnd,
									sPhyStart, sPhyEnd);
			break;
		case PVRSRV_CACHE_OP_INVALIDATE:
			OSCPUCacheInvalidateRangeKM(psDevNode, pbVirtStart, pbVirtEnd,
									sPhyStart, sPhyEnd);
			break;
		case PVRSRV_CACHE_OP_FLUSH:
			OSCPUCacheFlushRangeKM(psDevNode, pbVirtStart, pbVirtEnd,
									sPhyStart, sPhyEnd);
			break;
		default:
			PVR_DPF((PVR_DBG_ERROR,	"%s: Invalid cache operation type %d",
					__func__, uiCacheOp));
			break;
	}

#if defined(CACHEOP_DEBUG)
	/* Tracks the number of kernel-mode cacheline maintenance instructions */
	gsCwq.ui32ServerRBF += (uiFlushSize & ((IMG_DEVMEM_SIZE_T)~(gsCwq.uiLineSize - 1))) >> gsCwq.uiLineShift;
#endif
}

static INLINE void CacheOpExecRangeBasedVA(PVRSRV_DEVICE_NODE *psDevNode,
										 IMG_CPU_VIRTADDR pvAddress,
										 IMG_DEVMEM_SIZE_T uiSize,
										 PVRSRV_CACHE_OP uiCacheOp)
{
	/* Physical addresses are never consulted on this VA-only path; hand
	   the OS layer a recognisable poison value instead */
	IMG_CPU_PHYADDR sCpuPhyAddrUnused =
		{ IMG_CAST_TO_CPUPHYADDR_UINT(0xCAFEF00DDEADBEEFULL) };
	uintptr_t uiLineMask = (uintptr_t)gsCwq.uiLineSize - 1;
	IMG_BYTE *pbRangeStart = (IMG_BYTE*)((uintptr_t)pvAddress & ~uiLineMask);
	IMG_BYTE *pbRangeEnd = (IMG_BYTE*)((uintptr_t)pvAddress + (uintptr_t)uiSize);

	/* Widen an unaligned [start, end) out to whole cache lines so every
	   line touched by the request gets maintained: start was rounded down
	   above, now round end up to the next line multiple */
	pbRangeEnd = (IMG_BYTE *) PVR_ALIGN((uintptr_t)pbRangeEnd, (uintptr_t)gsCwq.uiLineSize);

	switch (uiCacheOp)
	{
		case PVRSRV_CACHE_OP_CLEAN:
			OSCPUCacheCleanRangeKM(psDevNode, pbRangeStart, pbRangeEnd, sCpuPhyAddrUnused, sCpuPhyAddrUnused);
			break;
		case PVRSRV_CACHE_OP_INVALIDATE:
			OSCPUCacheInvalidateRangeKM(psDevNode, pbRangeStart, pbRangeEnd, sCpuPhyAddrUnused, sCpuPhyAddrUnused);
			break;
		case PVRSRV_CACHE_OP_FLUSH:
			OSCPUCacheFlushRangeKM(psDevNode, pbRangeStart, pbRangeEnd, sCpuPhyAddrUnused, sCpuPhyAddrUnused);
			break;
		default:
			PVR_DPF((PVR_DBG_ERROR,	"%s: Invalid cache operation type %d",
					 __func__, uiCacheOp));
			break;
	}

#if defined(CACHEOP_DEBUG)
	/* Tracks the number of kernel-mode cacheline maintenance instructions */
	gsCwq.ui32ServerRBF += (uiSize & ((IMG_DEVMEM_SIZE_T)~(gsCwq.uiLineSize - 1))) >> gsCwq.uiLineShift;
#endif
}

/* Validates the optional caller-supplied CPU VA covering [uiOffset,
   uiOffset+uiSize) of psPMR before it is used for d-cache maintenance.
   On success *ppvOutAddress holds either the offset-adjusted VA or NULL
   (NULL means "no usable VA; fall back to kernel mapping / PA paths").
   Returns PVRSRV_ERROR_INVALID_CPU_ADDR only for bad *KM* addresses; bad
   UM addresses are silently dropped to NULL. On non-Linux builds (or with
   CACHEFLUSH_NO_KMRBF_USING_UMVA) the VA is unconditionally discarded. */
static INLINE PVRSRV_ERROR CacheOpValidateVAOffset(PMR *psPMR,
												IMG_CPU_VIRTADDR pvAddress,
												IMG_DEVMEM_OFFSET_T uiOffset,
												IMG_DEVMEM_SIZE_T uiSize,
												void **ppvOutAddress)
{
	PVRSRV_ERROR eError = PVRSRV_OK;
#if defined(LINUX) && !defined(CACHEFLUSH_NO_KMRBF_USING_UMVA)
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
#endif
	void __user *pvAddr;

	if (! pvAddress)
	{
		/* As pvAddress is optional, NULL is expected from UM/KM requests */
		pvAddr = NULL;
		goto e0;
	}

#if !defined(LINUX) || defined(CACHEFLUSH_NO_KMRBF_USING_UMVA)
	pvAddr = NULL;
#else
	/* Validate VA, assume most basic address limit access_ok() check */
	pvAddr = (void __user *)(uintptr_t)((uintptr_t)pvAddress + uiOffset);
	if (!access_ok(pvAddr, uiSize))
	{
		pvAddr = NULL;
		if (! mm)
		{
			/* Bad KM request, don't silently ignore */
			eError = PVRSRV_ERROR_INVALID_CPU_ADDR;
			goto e0;
		}
	}
	else if (mm)
	{
		/* UM request (current task has an mm): verify the VA belongs to a
		   VMA that maps this very PMR */
		down_read(&mm->mmap_sem);

		vma = find_vma(mm, (unsigned long)(uintptr_t)pvAddr);
		/* NOTE(review): the third clause compares the VMA's *total size*
		   against uiSize, which rejects VMAs larger than the request; the
		   intent presumably is to reject requests extending past vm_end
		   (i.e. pvAddr + uiSize > vma->vm_end) -- confirm against the
		   upstream DDK */
		if (!vma ||
			vma->vm_start > (unsigned long)(uintptr_t)pvAddr ||
			vma->vm_end - vma->vm_start > (unsigned long)(uintptr_t)uiSize)
		{
			/* Out of range mm_struct->vm_area VA */
			pvAddr = NULL;
		}
		else if (vma->vm_private_data != psPMR)
		{
			/*
			   Unknown mm_struct->vm_area VA, can't risk dcache maintenance using
			   this VA as the client user space mapping could be removed without
			   us knowing which might induce CPU fault during cache maintenance.
			*/
			pvAddr = NULL;
		}
		else if ((uintptr_t)pvAddress < (uintptr_t)gsCwq.uiPageSize)
		{
			/* Silently suppress UM NULL page pointers */
			pvAddr = NULL;
		}

		up_read(&mm->mmap_sem);
	}
	else
	{
		pgd_t *pgd;
		p4d_t *p4d;
		pud_t *pud;
		pmd_t *pmd;
		pte_t *ptep;
		mm = current->active_mm;

		/*
			For KM requests perform additional VA validation, so we walk the
			kernel page-table structures to be sure VA is safe to use.
		*/
		pgd = pgd_offset(mm, (uintptr_t)pvAddr);
		if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		{
			eError = PVRSRV_ERROR_INVALID_CPU_ADDR;
			pvAddr = NULL;
			goto e0;
		}

		p4d = p4d_offset(pgd, (uintptr_t)pvAddr);
		if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
		{
			eError = PVRSRV_ERROR_INVALID_CPU_ADDR;
			pvAddr = NULL;
			goto e0;
		}

		pud = pud_offset(p4d, (uintptr_t)pvAddr);
		if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		{
			eError = PVRSRV_ERROR_INVALID_CPU_ADDR;
			pvAddr = NULL;
			goto e0;
		}

		pmd = pmd_offset(pud, (uintptr_t)pvAddr);
		if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		{
			eError = PVRSRV_ERROR_INVALID_CPU_ADDR;
			pvAddr = NULL;
			goto e0;
		}

		/* NOTE(review): pte_offset_map() has no matching pte_unmap() on
		   any path below; on configurations where the PTE page is kmapped
		   (e.g. 32-bit HIGHPTE) this leaks a kmap slot -- confirm */
		ptep = pte_offset_map(pmd, (uintptr_t)pvAddr);
		if (!ptep || !pte_present(*ptep))
		{
			eError = PVRSRV_ERROR_INVALID_CPU_ADDR;
			pvAddr = NULL;
			goto e0;
		}
	}
#endif

e0:
	*ppvOutAddress = (IMG_CPU_VIRTADDR __force) pvAddr;
	return eError;
}

/*
 * Core CPU d-cache maintenance entry point for a PMR-backed byte range.
 *
 * psPMR               - backing store of the range to maintain
 * pvAddress           - optional caller-supplied CPU VA for the range (UM or
 *                       KM); validated before use and may be discarded in
 *                       favour of (re)acquired kernel mappings or physical
 *                       addresses
 * uiOffset, uiSize    - byte range within the PMR
 * uiCacheOp           - clean/invalidate/flush; NONE and TIMELINE are no-ops
 * ui32GFlushSeqNum    - GF sequence number this request was queued against;
 *                       lets the request be discarded if a global flush has
 *                       happened since
 * bIsRequestValidated - IMG_TRUE when the caller already range-checked the
 *                       request and holds the PMR phys-addr lock (this
 *                       function then neither takes nor drops it)
 * pbUsedGlobalFlush   - out: IMG_TRUE if the request was satisfied by a
 *                       global flush rather than range-based maintenance
 */
static PVRSRV_ERROR CacheOpPMRExec (PMR *psPMR,
									IMG_CPU_VIRTADDR pvAddress,
									IMG_DEVMEM_OFFSET_T uiOffset,
									IMG_DEVMEM_SIZE_T uiSize,
									PVRSRV_CACHE_OP uiCacheOp,
									IMG_UINT32 ui32GFlushSeqNum,
									IMG_BOOL bIsRequestValidated,
									IMG_BOOL *pbUsedGlobalFlush)
{
	IMG_HANDLE hPrivOut;
	IMG_BOOL bPMRIsSparse;
	IMG_UINT32 ui32PageIndex;
	IMG_UINT32 ui32NumOfPages;
	IMG_DEVMEM_SIZE_T uiOutSize;
	PVRSRV_DEVICE_NODE *psDevNode;
	IMG_DEVMEM_SIZE_T uiPgAlignedSize;
	IMG_DEVMEM_OFFSET_T uiPgAlignedOffset;
	IMG_DEVMEM_OFFSET_T uiCLAlignedEndOffset;
	IMG_DEVMEM_OFFSET_T uiPgAlignedEndOffset;
	IMG_DEVMEM_OFFSET_T uiCLAlignedStartOffset;
	IMG_DEVMEM_OFFSET_T uiPgAlignedStartOffset;
	IMG_BOOL abValid[PMR_MAX_TRANSLATION_STACK_ALLOC];
	IMG_CPU_PHYADDR asCpuPhyAddr[PMR_MAX_TRANSLATION_STACK_ALLOC];
	IMG_CPU_PHYADDR *psCpuPhyAddr = asCpuPhyAddr;
	IMG_BOOL bIsPMRInfoValid = IMG_FALSE;
	PVRSRV_ERROR eError = PVRSRV_OK;
	IMG_BYTE *pbCpuVirtAddr = NULL;
	IMG_BOOL *pbValid = abValid;
	*pbUsedGlobalFlush = IMG_FALSE;

	if (uiCacheOp == PVRSRV_CACHE_OP_NONE || uiCacheOp == PVRSRV_CACHE_OP_TIMELINE)
	{
		return PVRSRV_OK;
	}

	/* Check for explicitly requested-for KGF or KRBF promoted to KGF requests */
	if (uiCacheOp == PVRSRV_CACHE_OP_GLOBAL || uiSize == 0 ||
	    (IMG_UINT32)uiSize > gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD])
	{
		/* Discard if an else-when KGF has occurred in the interim time */
		if (gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] > ui32GFlushSeqNum)
		{
#if defined(CACHEOP_DEBUG)
			gsCwq.ui32KMDiscards += 1;
#endif
			return PVRSRV_OK;
		}
		/* Some CPU ISA support KGF, if it fails fall-back to KRBF */
		else if ((eError = CacheOpGlobalFlush()) == PVRSRV_OK)
		{
			*pbUsedGlobalFlush = IMG_TRUE;
			return PVRSRV_OK;
		}
		/* Request with uiSize=0 is treated as a KGF request as well */
		else if (uiCacheOp == PVRSRV_CACHE_OP_GLOBAL || uiSize == 0)
		{
			/* Cannot fall-back to KRBF as an explicit KGF was erroneously requested for */
			PVR_LOGR_IF_ERROR(eError, CACHEOP_NO_GFLUSH_ERROR_STRING);
			CACHEOP_PVR_ASSERT(0);
		}
	}

	if (! bIsRequestValidated)
	{
		IMG_DEVMEM_SIZE_T uiLPhysicalSize;

		/* Need to validate parameters before proceeding */
		eError = PMR_PhysicalSize(psPMR, &uiLPhysicalSize);
		PVR_LOGR_IF_ERROR(eError, "uiLPhysicalSize");

		PVR_LOGR_IF_FALSE(((uiOffset+uiSize) <= uiLPhysicalSize), CACHEOP_DEVMEM_OOR_ERROR_STRING, PVRSRV_ERROR_DEVICEMEM_OUT_OF_RANGE);

		eError = PMRLockSysPhysAddresses(psPMR);
		PVR_LOGR_IF_ERROR(eError, "PMRLockSysPhysAddresses");
	}

	/* Fast track the request if a CPU VA is provided and CPU ISA supports VA only maintenance */
	eError = CacheOpValidateVAOffset(psPMR, pvAddress, uiOffset, uiSize, (void**)&pbCpuVirtAddr);
	if (eError == PVRSRV_OK)
	{
		pvAddress = pbCpuVirtAddr;

		if (pvAddress && gsCwq.uiCacheOpAddrType == PVRSRV_CACHE_OP_ADDR_TYPE_VIRTUAL)
		{
			CacheOpExecRangeBasedVA(PMR_DeviceNode(psPMR), pvAddress, uiSize, uiCacheOp);
			if (! bIsRequestValidated)
			{
				eError = PMRUnlockSysPhysAddresses(psPMR);
				PVR_LOG_IF_ERROR(eError, "PMRUnlockSysPhysAddresses");
			}
#if defined(CACHEOP_DEBUG)
			gsCwq.ui32ServerSyncVA += 1;
#endif
			return PVRSRV_OK;
		}
		else if (pvAddress)
		{
			/* Round down the incoming VA (if any) down to the nearest page aligned VA */
			 pvAddress = (void*)((uintptr_t)pvAddress & ~((uintptr_t)gsCwq.uiPageSize-1));
#if defined(CACHEOP_DEBUG)
			gsCwq.ui32ServerSyncVA += 1;
#endif
		}
	}
	else
	{
		/*
		 * This validation pathway has been added to accommodate any/all requests that might
		 * cause the kernel to Oops; essentially, KM requests should prevalidate cache maint.
		 * parameters but if this fails then we would rather fail gracefully than cause the
		 * kernel to Oops so instead we log the fact that an invalid KM virtual address was
		 * supplied and what action was taken to mitigate against kernel Oops(ing) if any.
		 */
		CACHEOP_PVR_ASSERT(pbCpuVirtAddr == NULL);

		if (gsCwq.uiCacheOpAddrType == PVRSRV_CACHE_OP_ADDR_TYPE_PHYSICAL)
		{
			PVR_DPF((PVR_DBG_WARNING,
					"%s: Invalid vaddress 0x%p in CPU d-cache maint. op, using paddress",
					__func__,
					pvAddress));

			/* We can still proceed as kernel/cpu uses CPU PA for d-cache maintenance */
			pvAddress = NULL;
		}
		else if (CacheOpGlobalFlush() == PVRSRV_OK)
		{
			PVR_DPF((PVR_DBG_WARNING,
					"%s: Invalid vaddress 0x%p in CPU d-cache maint. op, used global flush",
					__func__,
					pvAddress));

			/* Saved by global flush impl. */
			*pbUsedGlobalFlush = IMG_TRUE;
			eError = PVRSRV_OK;
			goto e0;
		}
		else
		{
			/*
			 * The approach here is to attempt a reacquisition of the PMR kernel VA and see if
			 * said VA corresponds to the parameter VA, if so fail requested cache maint. op.
			 * cause this indicates some kind of internal, memory and/or meta-data corruption
			 * else we reissue the request using this (re)acquired alias PMR kernel VA.
			 */
			if (PMR_IsSparse(psPMR))
			{
				eError = PMRAcquireSparseKernelMappingData(psPMR,
														   0,
														   gsCwq.uiPageSize,
														   (void **)&pbCpuVirtAddr,
														   (size_t*)&uiOutSize,
														   &hPrivOut);
				PVR_LOGG_IF_ERROR(eError, "PMRAcquireSparseKernelMappingData", e0);
			}
			else
			{
				eError = PMRAcquireKernelMappingData(psPMR,
													 0,
													 gsCwq.uiPageSize,
													 (void **)&pbCpuVirtAddr,
													 (size_t*)&uiOutSize,
													 &hPrivOut);
				PVR_LOGG_IF_ERROR(eError, "PMRAcquireKernelMappingData", e0);
			}

			/* Here, we only compare these CPU virtual addresses at granularity of the OS page size */
			if ((uintptr_t)pbCpuVirtAddr == ((uintptr_t)pvAddress & ~((uintptr_t)gsCwq.uiPageSize-1)))
			{
				PVR_DPF((PVR_DBG_ERROR,
						"%s: Invalid vaddress 0x%p in CPU d-cache maint. op, no alt. so failing request",
						__func__,
						pvAddress));

				eError = PMRReleaseKernelMappingData(psPMR, hPrivOut);
				PVR_LOG_IF_ERROR(eError, "PMRReleaseKernelMappingData");

				eError = PVRSRV_ERROR_INVALID_CPU_ADDR;
				goto e0;
			}
			else if (gsCwq.uiCacheOpAddrType == PVRSRV_CACHE_OP_ADDR_TYPE_VIRTUAL)
			{
				PVR_DPF((PVR_DBG_WARNING,
						"%s: Bad vaddress 0x%p in CPU d-cache maint. op, using reacquired vaddress 0x%p",
						__func__,
						pvAddress,
						pbCpuVirtAddr));

				/* Note that this might still fail if there is kernel memory/meta-data corruption;
				   there is not much we can do here but at the least we will be informed of this
				   before the kernel Oops(ing) */
				CacheOpExecRangeBasedVA(PMR_DeviceNode(psPMR), pbCpuVirtAddr, uiSize, uiCacheOp);

				eError = PMRReleaseKernelMappingData(psPMR, hPrivOut);
				PVR_LOG_IF_ERROR(eError, "PMRReleaseKernelMappingData");

				eError = PVRSRV_OK;
				goto e0;
			}
			else
			{
				/* At this junction, we have exhausted every possible work-around possible but we do
				   know that VA reacquisition returned another/alias page-aligned VA; so with this
				   future expectation of PMRAcquireKernelMappingData(), we proceed */
				PVR_DPF((PVR_DBG_WARNING,
						"%s: Bad vaddress %p in CPU d-cache maint. op, will use reacquired vaddress",
						__func__,
						pvAddress));

				eError = PMRReleaseKernelMappingData(psPMR, hPrivOut);
				PVR_LOG_IF_ERROR(eError, "PMRReleaseKernelMappingData");

				/* NULL this to force per-page reacquisition down-stream */
				pvAddress = NULL;
			}
		}
	}

	/* NULL clobbered var., OK to proceed */
	pbCpuVirtAddr = NULL;
	eError = PVRSRV_OK;

	/* Need this for kernel mapping */
	bPMRIsSparse = PMR_IsSparse(psPMR);
	psDevNode = PMR_DeviceNode(psPMR);

	/* Round the incoming offset down to the nearest cache-line / page aligned-address */
	uiCLAlignedEndOffset = uiOffset + uiSize;
	uiCLAlignedEndOffset = PVR_ALIGN(uiCLAlignedEndOffset, (IMG_DEVMEM_SIZE_T)gsCwq.uiLineSize);
	uiCLAlignedStartOffset = (uiOffset & ~((IMG_DEVMEM_OFFSET_T)gsCwq.uiLineSize-1));

	uiPgAlignedEndOffset = uiCLAlignedEndOffset;
	uiPgAlignedEndOffset = PVR_ALIGN(uiPgAlignedEndOffset, (IMG_DEVMEM_SIZE_T)gsCwq.uiPageSize);
	uiPgAlignedStartOffset = (uiOffset & ~((IMG_DEVMEM_OFFSET_T)gsCwq.uiPageSize-1));
	uiPgAlignedSize = uiPgAlignedEndOffset - uiPgAlignedStartOffset;

#if defined(CACHEOP_NO_CACHE_LINE_ALIGNED_ROUNDING)
	/* For internal debug if cache-line optimised
	   flushing is suspected of causing data corruption */
	uiCLAlignedStartOffset = uiPgAlignedStartOffset;
	uiCLAlignedEndOffset = uiPgAlignedEndOffset;
#endif

	/* Type of allocation backing the PMR data */
	ui32NumOfPages = uiPgAlignedSize >> gsCwq.uiPageShift;
	if (ui32NumOfPages > PMR_MAX_TRANSLATION_STACK_ALLOC)
	{
		/* The pbValid array is allocated first as it is needed in
		   both physical/virtual cache maintenance methods */
		pbValid = OSAllocZMem(ui32NumOfPages * sizeof(IMG_BOOL));
		if (! pbValid)
		{
			/* Allocation failure is non-fatal: fall back to the stack
			   arrays and per-page look-ups (bIsPMRInfoValid stays false) */
			pbValid = abValid;
		}
		else if (gsCwq.uiCacheOpAddrType != PVRSRV_CACHE_OP_ADDR_TYPE_VIRTUAL)
		{
			psCpuPhyAddr = OSAllocZMem(ui32NumOfPages * sizeof(IMG_CPU_PHYADDR));
			if (! psCpuPhyAddr)
			{
				psCpuPhyAddr = asCpuPhyAddr;
				OSFreeMem(pbValid);
				pbValid = abValid;
			}
		}
	}

	/* We always retrieve PMR data in bulk, up-front if number of pages is within
	   PMR_MAX_TRANSLATION_STACK_ALLOC limits else we check to ensure that a
	   dynamic buffer has been allocated to satisfy requests outside limits */
	if (ui32NumOfPages <= PMR_MAX_TRANSLATION_STACK_ALLOC || pbValid != abValid)
	{
		if (gsCwq.uiCacheOpAddrType != PVRSRV_CACHE_OP_ADDR_TYPE_VIRTUAL)
		{
			/* Look-up PMR CpuPhyAddr once, if possible */
			eError = PMR_CpuPhysAddr(psPMR,
									 gsCwq.uiPageShift,
									 ui32NumOfPages,
									 uiPgAlignedStartOffset,
									 psCpuPhyAddr,
									 pbValid);
			if (eError == PVRSRV_OK)
			{
				bIsPMRInfoValid = IMG_TRUE;
			}
		}
		else
		{
			/* Look-up PMR per-page validity once, if possible */
			eError = PMR_IsOffsetValid(psPMR,
									   gsCwq.uiPageShift,
									   ui32NumOfPages,
									   uiPgAlignedStartOffset,
									   pbValid);
			bIsPMRInfoValid = (eError == PVRSRV_OK) ? IMG_TRUE : IMG_FALSE;
		}
	}

	/* For each (possibly non-contiguous) PMR page(s), carry out the requested cache maint. op. */
	for (uiPgAlignedOffset = uiPgAlignedStartOffset, ui32PageIndex = 0;
		 uiPgAlignedOffset < uiPgAlignedEndOffset;
		 uiPgAlignedOffset += (IMG_DEVMEM_OFFSET_T) gsCwq.uiPageSize, ui32PageIndex += 1)
	{
		/* Just before issuing the CacheOp RBF, check if it can be discarded */
		if (gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] > ui32GFlushSeqNum)
		{
#if defined(CACHEOP_DEBUG)
			gsCwq.ui32KMDiscards += 1;
#endif
			break;
		}

		if (! bIsPMRInfoValid)
		{
			/* Never cross page boundary without looking up corresponding PMR page physical
			   address and/or page validity if these were not looked-up, in bulk, up-front */
			ui32PageIndex = 0;
			if (gsCwq.uiCacheOpAddrType != PVRSRV_CACHE_OP_ADDR_TYPE_VIRTUAL)
			{
				eError = PMR_CpuPhysAddr(psPMR,
										 gsCwq.uiPageShift,
										 1,
										 uiPgAlignedOffset,
										 psCpuPhyAddr,
										 pbValid);
				PVR_LOGG_IF_ERROR(eError, "PMR_CpuPhysAddr", e0);
			}
			else
			{
				eError = PMR_IsOffsetValid(psPMR,
										  gsCwq.uiPageShift,
										  1,
										  uiPgAlignedOffset,
										  pbValid);
				PVR_LOGG_IF_ERROR(eError, "PMR_IsOffsetValid", e0);
			}
		}

		/* Skip invalid PMR pages (i.e. sparse) */
		if (pbValid[ui32PageIndex] == IMG_FALSE)
		{
			CACHEOP_PVR_ASSERT(bPMRIsSparse);
			continue;
		}

		if (pvAddress)
		{
			/* The caller has supplied either a KM/UM CpuVA, so use it unconditionally */
			pbCpuVirtAddr =
				(void *)(uintptr_t)((uintptr_t)pvAddress + (uintptr_t)(uiPgAlignedOffset-uiPgAlignedStartOffset));
		}
		/* Skip CpuVA acquire if CacheOp can be maintained entirely using CpuPA */
		else if (gsCwq.uiCacheOpAddrType != PVRSRV_CACHE_OP_ADDR_TYPE_PHYSICAL)
		{
			if (bPMRIsSparse)
			{
				eError =
					PMRAcquireSparseKernelMappingData(psPMR,
													  uiPgAlignedOffset,
													  gsCwq.uiPageSize,
													  (void **)&pbCpuVirtAddr,
													  (size_t*)&uiOutSize,
													  &hPrivOut);
				PVR_LOGG_IF_ERROR(eError, "PMRAcquireSparseKernelMappingData", e0);
			}
			else
			{
				eError =
					PMRAcquireKernelMappingData(psPMR,
												uiPgAlignedOffset,
												gsCwq.uiPageSize,
												(void **)&pbCpuVirtAddr,
												(size_t*)&uiOutSize,
												&hPrivOut);
				PVR_LOGG_IF_ERROR(eError, "PMRAcquireKernelMappingData", e0);
			}
		}

		/* Issue actual cache maintenance for PMR */
		CacheOpExecRangeBased(psDevNode,
							uiCacheOp,
							pbCpuVirtAddr,
							(gsCwq.uiCacheOpAddrType != PVRSRV_CACHE_OP_ADDR_TYPE_VIRTUAL) ?
								psCpuPhyAddr[ui32PageIndex] : psCpuPhyAddr[0],
							uiPgAlignedOffset,
							uiCLAlignedStartOffset,
							uiCLAlignedEndOffset);

		if (! pvAddress)
		{
			/* The caller has not supplied either a KM/UM CpuVA, release mapping */
			if (gsCwq.uiCacheOpAddrType != PVRSRV_CACHE_OP_ADDR_TYPE_PHYSICAL)
			{
				eError = PMRReleaseKernelMappingData(psPMR, hPrivOut);
				PVR_LOG_IF_ERROR(eError, "PMRReleaseKernelMappingData");
			}
		}
	}

e0:
	/* Release any dynamically allocated translation buffers */
	if (psCpuPhyAddr != asCpuPhyAddr)
	{
		OSFreeMem(psCpuPhyAddr);
	}

	if (pbValid != abValid)
	{
		OSFreeMem(pbValid);
	}

	/* Drop the phys-addr lock iff this function took it */
	if (! bIsRequestValidated)
	{
		eError = PMRUnlockSysPhysAddresses(psPMR);
		PVR_LOG_IF_ERROR(eError, "PMRUnlockSysPhysAddresses");
	}

	return eError;
}

/* Drains the deferred CacheOp queue by predicating (or issuing) a single
   global cache flush (GF) covering every currently queued work-item, then
   retires that snapshot of entries. Must only be called from the single
   queue-consumer context (the CacheOp thread): the logic assumes the
   read-counter cannot advance underneath it.
   Returns PVRSRV_OK, or the error from the GF/timeline notification. */
static PVRSRV_ERROR CacheOpQListExecGlobal(void)
{
	PVRSRV_ERROR eError = PVRSRV_OK;
	IMG_UINT32 ui32NumOfEntries;
	CACHEOP_WORK_ITEM *psCacheOpWorkItem;
#if defined(CACHEOP_DEBUG)
	IMG_UINT64 uiTimeNow = 0;
	IMG_UINT64 ui64DequeuedTime;
	CACHEOP_WORK_ITEM sCacheOpWorkItem = {0};
#endif
	CACHEOP_PVR_ASSERT(!gsCwq.bNoGlobalFlushImpl);

	/* Take the current snapshot of queued CacheOps before we issue a global cache
	   flush operation so that we retire the right amount of CacheOps that has
	   been affected by the to-be executed global CacheOp */
	ui32NumOfEntries = CacheOpIdxSpan(&gsCwq.hWriteCounter, &gsCwq.hReadCounter);
	if (OSAtomicRead(&gsCwq.hWriteCounter) < OSAtomicRead(&gsCwq.hReadCounter))
	{
		/* Branch handles when the write-counter has wrapped-around in value space.
		   The logic here works seeing the read-counter does not change value for
		   the duration of this function so we don't run the risk of it too wrapping
		   round whilst the number of entries is being determined here, that is to
		   say, the consumer in this framework is single threaded and this function
		   is that consumer thread */
		ui32NumOfEntries = CacheOpIdxSpan(&gsCwq.hReadCounter, &gsCwq.hWriteCounter);

		/* Two's complement arithmetic gives the number of entries */
		ui32NumOfEntries = CACHEOP_INDICES_MAX - ui32NumOfEntries;
	}
	if (! ui32NumOfEntries)
	{
		return PVRSRV_OK;
	}
#if defined(CACHEOP_DEBUG)
	CACHEOP_PVR_ASSERT(ui32NumOfEntries < CACHEOP_INDICES_MAX);
#endif

	/* Use the current/latest queue-tail work-item's GF/SeqNum to predicate GF */
	psCacheOpWorkItem = &gsCwq.asWorkItems[CacheOpIdxRead(&gsCwq.hWriteCounter)];
	CacheOpQItemReadCheck(psCacheOpWorkItem);
#if defined(CACHEOP_DEBUG)
	/* The time waiting in the queue to be serviced */
	ui64DequeuedTime = OSClockns64();
#endif

	/* Check if items between [hRead/hWrite]Counter can be discarded before issuing GF */
	if (gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] > psCacheOpWorkItem->ui32GFSeqNum)
	{
		/* A GF has already happened since the tail item was queued, so the whole
		   snapshot can be discarded without any cache maintenance.
		   The currently discarded CacheOp item updates gsCwq.hCompletedSeqNum */
		OSAtomicWrite(&gsCwq.hCompletedSeqNum, psCacheOpWorkItem->ui32OpSeqNum);
#if defined(CACHEOP_DEBUG)
		gsCwq.ui32KMDiscards += ui32NumOfEntries;
#endif
	}
	else
	{
		eError = CacheOpGlobalFlush();
		PVR_LOGR_IF_ERROR(eError, "CacheOpGlobalFlush");
#if defined(CACHEOP_DEBUG)
		uiTimeNow = OSClockns64();
		sCacheOpWorkItem.bDeferred = IMG_TRUE;
		sCacheOpWorkItem.ui64ExecuteTime = uiTimeNow;
		sCacheOpWorkItem.psPMR = gsCwq.psInfoPagePMR;
		sCacheOpWorkItem.pid = OSGetCurrentProcessID();
		sCacheOpWorkItem.uiCacheOp = PVRSRV_CACHE_OP_GLOBAL;
		sCacheOpWorkItem.ui64DequeuedTime = ui64DequeuedTime;
		sCacheOpWorkItem.ui64EnqueuedTime = psCacheOpWorkItem->ui64EnqueuedTime;
		sCacheOpWorkItem.ui32OpSeqNum = gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0];
#endif
	}

	/* Retire every work-item in the snapshot taken above */
	while (ui32NumOfEntries)
	{
		psCacheOpWorkItem = &gsCwq.asWorkItems[CacheOpIdxNext(&gsCwq.hReadCounter)];
		CacheOpQItemReadCheck(psCacheOpWorkItem);

#if defined(CACHEOP_DEBUG)
		if (psCacheOpWorkItem->uiCacheOp != PVRSRV_CACHE_OP_GLOBAL)
		{
			psCacheOpWorkItem->bRBF = IMG_FALSE;
			if (! uiTimeNow)
			{
				/* Measure deferred queueing overhead only (discard path: no GF
				   was executed, so sample the clock once on first use) */
				uiTimeNow = OSClockns64();
			}
			psCacheOpWorkItem->ui64ExecuteTime = uiTimeNow;
			psCacheOpWorkItem->ui64DequeuedTime = ui64DequeuedTime;
			CacheOpStatsExecLogWrite(psCacheOpWorkItem);
		}
		/* Something's gone horribly wrong if these 2 counters are identical at this point */
		CACHEOP_PVR_ASSERT(OSAtomicRead(&gsCwq.hWriteCounter) != OSAtomicRead(&gsCwq.hReadCounter));
#endif

		/* If CacheOp is timeline(d), notify timeline waiters */
		eError = CacheOpTimelineExec(psCacheOpWorkItem);
		PVR_LOG_IF_ERROR(eError, "CacheOpTimelineExec");

		/* Mark index as ready for recycling for next CacheOp */
		CacheOpQItemRecycle(psCacheOpWorkItem);
		(void) CacheOpIdxIncrement(&gsCwq.hReadCounter);
		ui32NumOfEntries = ui32NumOfEntries - 1;
	}

#if defined(CACHEOP_DEBUG)
	if (uiTimeNow)
	{
		/* Only log GF that was actually executed */
		CacheOpStatsExecLogWrite(&sCacheOpWorkItem);
	}
#endif

	return eError;
}

/* Drains the deferred CacheOp queue using range-based flush (RBF) cache
   maintenance per work-item. If CacheOpPMRExec() escalates an item to a
   global flush (GF), the loop breaks out and the remaining queued items
   covered by that GF are retired as discards (no further maintenance).
   Must only be called from the single queue-consumer context (the CacheOp
   thread). Returns PVRSRV_OK or the last error encountered. */
static PVRSRV_ERROR CacheOpQListExecRangeBased(void)
{
	IMG_UINT32 ui32NumOfEntries;
	PVRSRV_ERROR eError = PVRSRV_OK;
	IMG_UINT32 ui32WriteCounter = ~0;
	IMG_BOOL bUsedGlobalFlush = IMG_FALSE;
	CACHEOP_WORK_ITEM *psCacheOpWorkItem = NULL;
#if defined(CACHEOP_DEBUG)
	IMG_UINT64 uiTimeNow = 0;
#endif

	/* Take a snapshot of the current count of deferred entries at this junction */
	ui32NumOfEntries = CacheOpIdxSpan(&gsCwq.hWriteCounter, &gsCwq.hReadCounter);
	if (! ui32NumOfEntries)
	{
		return PVRSRV_OK;
	}
#if defined(CACHEOP_DEBUG)
	CACHEOP_PVR_ASSERT(ui32NumOfEntries < CACHEOP_INDICES_MAX);
#endif

	while (ui32NumOfEntries)
	{
		if (! OSAtomicRead(&gsCwq.hReadCounter))
		{
			/* Normally, the read-counter will trail the write counter until the write
			   counter wraps-round to zero. Under this condition we (re)calculate as the
			   read-counter too is wrapping around at this point */
			ui32NumOfEntries = CacheOpIdxSpan(&gsCwq.hWriteCounter, &gsCwq.hReadCounter);
		}
#if defined(CACHEOP_DEBUG)
		/* Something's gone horribly wrong if these 2 counters are identical at this point */
		CACHEOP_PVR_ASSERT(OSAtomicRead(&gsCwq.hWriteCounter) != OSAtomicRead(&gsCwq.hReadCounter));
#endif

		/* Select the next pending deferred work-item for RBF cache maintenance */
		psCacheOpWorkItem = &gsCwq.asWorkItems[CacheOpIdxNext(&gsCwq.hReadCounter)];
		CacheOpQItemReadCheck(psCacheOpWorkItem);
#if defined(CACHEOP_DEBUG)
		/* The time waiting in the queue to be serviced */
		psCacheOpWorkItem->ui64DequeuedTime = OSClockns64();
#endif

		/* The following CacheOpPMRExec() could trigger a GF, so we (re)read this
		   counter just in case so that we know all such pending CacheOp(s) that will
		   benefit from this soon-to-be-executed GF */
		ui32WriteCounter = CacheOpConfigSupports(CACHEOP_CONFIG_KGF) ?
								OSAtomicRead(&gsCwq.hWriteCounter) : ui32WriteCounter;

		eError = CacheOpPMRExec(psCacheOpWorkItem->psPMR,
								NULL, /* No UM virtual address */
								psCacheOpWorkItem->uiOffset,
								psCacheOpWorkItem->uiSize,
								psCacheOpWorkItem->uiCacheOp,
								psCacheOpWorkItem->ui32GFSeqNum,
								IMG_TRUE, /* PMR is pre-validated */
								&bUsedGlobalFlush);
		if (eError != PVRSRV_OK)
		{
#if defined(CACHEOP_DEBUG)
#define PID_FMTSPEC " PID:%u"
#define CACHE_OP_WORK_PID psCacheOpWorkItem->pid
#else
#define PID_FMTSPEC "%s"
#define CACHE_OP_WORK_PID ""
#endif

			/* Failure is logged but the item is still retired below; the
			   error is propagated via eError on function return */
			PVR_LOG(("Deferred CacheOpPMRExec failed:"
					 PID_FMTSPEC
					 " PMR:%p"
					 " Offset:%" IMG_UINT64_FMTSPECX
					 " Size:%" IMG_UINT64_FMTSPECX
					 " CacheOp:%d,"
					 " error: %d",
					CACHE_OP_WORK_PID,
					psCacheOpWorkItem->psPMR,
					psCacheOpWorkItem->uiOffset,
					psCacheOpWorkItem->uiSize,
					psCacheOpWorkItem->uiCacheOp,
					eError));

#undef PID_FMTSPEC
#undef CACHE_OP_WORK_PID
		}
		else if (bUsedGlobalFlush)
		{
			/* This item escalated to a GF; remaining queued items are now
			   covered by it and are retired in the loop below */
#if defined(CACHEOP_DEBUG)
			psCacheOpWorkItem->ui32OpSeqNum = gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0];
#endif
			break;
		}

#if defined(CACHEOP_DEBUG)
		if (psCacheOpWorkItem->uiCacheOp != PVRSRV_CACHE_OP_GLOBAL)
		{
			psCacheOpWorkItem->bRBF = IMG_TRUE;
			psCacheOpWorkItem->ui64ExecuteTime = OSClockns64();
			CacheOpStatsExecLogWrite(psCacheOpWorkItem);
		}
		else
		{
			CACHEOP_PVR_ASSERT(!gsCwq.bNoGlobalFlushImpl);
		}
#endif

		/* The currently executed CacheOp item updates gsCwq.hCompletedSeqNum.
		   NOTE: This CacheOp item might be a discard item, if so its seqNum
		   still updates the gsCwq.hCompletedSeqNum */
		OSAtomicWrite(&gsCwq.hCompletedSeqNum, psCacheOpWorkItem->ui32OpSeqNum);
		OSAtomicSubtract(&gsCwq.hDeferredSize, psCacheOpWorkItem->uiSize);

		/* If CacheOp is timeline(d), notify timeline waiters */
		eError = CacheOpTimelineExec(psCacheOpWorkItem);
		PVR_LOG_IF_ERROR(eError, "CacheOpTimelineExec");

		/* Indicate that this CCB work-item slot is now free for (re)use */
		CacheOpQItemRecycle(psCacheOpWorkItem);
		(void) CacheOpIdxIncrement(&gsCwq.hReadCounter);
		ui32NumOfEntries = ui32NumOfEntries - 1;
	}

	if (bUsedGlobalFlush)
	{
#if defined(CACHEOP_DEBUG)
		uiTimeNow = OSClockns64();
		CACHEOP_PVR_ASSERT(OSAtomicRead(&gsCwq.hWriteCounter) != OSAtomicRead(&gsCwq.hReadCounter));
#endif

		/* Snapshot of queued CacheOps before the global cache flush was issued.
		   NOTE(review): ui32WriteCounter is only refreshed when KGF is supported;
		   presumably bUsedGlobalFlush cannot be set without KGF support so the
		   initial ~0 value is never used here — confirm against CacheOpPMRExec */
		ui32NumOfEntries = ui32WriteCounter - OSAtomicRead(&gsCwq.hReadCounter);
		if (ui32WriteCounter < OSAtomicRead(&gsCwq.hReadCounter))
		{
			/* Branch handles when the write-counter has wrapped-around in value space */
			ui32NumOfEntries = OSAtomicRead(&gsCwq.hReadCounter) - ui32WriteCounter;
			ui32NumOfEntries = CACHEOP_INDICES_MAX - ui32NumOfEntries;
		}

		/* Retire remaining items covered by the GF; the first iteration uses
		   the work-item that triggered the GF (not yet recycled above) */
		while (ui32NumOfEntries)
		{
			CacheOpQItemReadCheck(psCacheOpWorkItem);

#if defined(CACHEOP_DEBUG)
			psCacheOpWorkItem->bRBF = IMG_FALSE;
			psCacheOpWorkItem->ui64ExecuteTime = uiTimeNow;
			if (psCacheOpWorkItem->uiCacheOp == PVRSRV_CACHE_OP_GLOBAL)
			{
				CACHEOP_PVR_ASSERT(!gsCwq.bNoGlobalFlushImpl);
				psCacheOpWorkItem->pid = OSGetCurrentProcessID();
			}
			CacheOpStatsExecLogWrite(psCacheOpWorkItem);
#endif

			eError = CacheOpTimelineExec(psCacheOpWorkItem);
			PVR_LOG_IF_ERROR(eError, "CacheOpTimelineExec");

			/* Mark index as ready for recycling for next CacheOp */
			CacheOpQItemRecycle(psCacheOpWorkItem);
			(void) CacheOpIdxIncrement(&gsCwq.hReadCounter);
			ui32NumOfEntries = ui32NumOfEntries - 1;
			psCacheOpWorkItem = &gsCwq.asWorkItems[CacheOpIdxNext(&gsCwq.hReadCounter)];
		}
	}

	return eError;
}

/* Dispatches one drain pass of the deferred CacheOp queue, choosing between
   global and range-based servicing, then wakes any fence-check waiters. */
static INLINE PVRSRV_ERROR CacheOpQListExec(void)
{
	PVRSRV_ERROR eError;
	IMG_BOOL bPreferGlobalFlush;

	/* A global flush is preferred when the configuration supports it and
	   either range-based maintenance is unavailable or the accumulated
	   deferred size exceeds the configured global-flush threshold */
	bPreferGlobalFlush = CacheOpConfigSupports(CACHEOP_CONFIG_KGF) &&
		(!CacheOpConfigSupports(CACHEOP_CONFIG_KRBF) ||
		 OSAtomicRead(&gsCwq.hDeferredSize) > gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD]);

	if (bPreferGlobalFlush)
	{
		eError = CacheOpQListExecGlobal();
		PVR_LOG_IF_ERROR(eError, "CacheOpQListExecGlobal");
	}
	else
	{
		eError = CacheOpQListExecRangeBased();
		PVR_LOG_IF_ERROR(eError, "CacheOpQListExecRangeBased");
	}

	/* Signal any waiting threads blocked on CacheOp fence checks update
	   completed sequence number to last queue work item */
	eError = OSEventObjectSignal(gsCwq.hClientWakeUpEvtObj);
	PVR_LOG_IF_ERROR(eError, "OSEventObjectSignal");

	return eError;
}

static void CacheOpThread(void *pvData)
{
	PVRSRV_DATA *psPVRSRVData = pvData;
	IMG_HANDLE hOSEvent;
	PVRSRV_ERROR eError;

	/* Open CacheOp thread event object, abort driver if event object open fails */
	eError = OSEventObjectOpen(gsCwq.hThreadWakeUpEvtObj, &hOSEvent);
	PVR_LOG_IF_ERROR(eError, "OSEventObjectOpen");

	/* While driver is in good state & loaded, perform pending cache maintenance */
	while ((psPVRSRVData->eServicesState == PVRSRV_SERVICES_STATE_OK) && gsCwq.bInit)
	{
		/* Sleep-wait here until when signalled for new queued CacheOp work items;
		   when woken-up, drain deferred queue completely before next event-wait */
		(void) OSEventObjectWaitKernel(hOSEvent, CACHEOP_THREAD_WAIT_TIMEOUT);
		while (CacheOpIdxSpan(&gsCwq.hWriteCounter, &gsCwq.hReadCounter))
		{
			eError = CacheOpQListExec();
			PVR_LOG_IF_ERROR(eError, "CacheOpQListExec");
		}
	}

	eError = CacheOpQListExec();
	PVR_LOG_IF_ERROR(eError, "CacheOpQListExec");

	eError = OSEventObjectClose(hOSEvent);
	PVR_LOG_IF_ERROR(eError, "OSEventObjectClose");
}

/* Handles the timeline-update component of a client CacheOp batch: either
   defers the timeline signal into the CacheOp work queue (only when the
   queue is empty and a slot is free, to respect Android's 1000ms timeline
   signalling deadline) or signals/fences it directly in the caller context.
   @psDevNode               Device node the timeline belongs to
   @iTimeline               Client timeline to signal
   @bUsedGlobalFlush        IMG_TRUE if the preceding batch used a global flush
   @ui32CurrentFenceSeqNum  Client's current fence sequence number
   @pui32NextFenceSeqNum    In/out: next fence sequence number (zeroed when a
                            GF makes an outstanding fence redundant)
   Returns PVRSRV_OK, PVRSRV_ERROR_RETRY (Android, no GF available), or the
   first error encountered. */
static PVRSRV_ERROR CacheOpBatchExecTimeline(PVRSRV_DEVICE_NODE *psDevNode,
											 PVRSRV_TIMELINE iTimeline,
											 IMG_BOOL bUsedGlobalFlush,
											 IMG_UINT32 ui32CurrentFenceSeqNum,
											 IMG_UINT32 *pui32NextFenceSeqNum)
{
	PVRSRV_ERROR eError;
	IMG_UINT32 ui32NextIdx;
	/* Use {0} (not the C23/GNU empty-brace form) for portability and for
	   consistency with the rest of this file */
	CACHEOP_WORK_ITEM sCacheOpWorkItem = {0};
	CACHEOP_WORK_ITEM *psCacheOpWorkItem = NULL;

	eError = CacheOpTimelineBind(psDevNode, &sCacheOpWorkItem, iTimeline);
	PVR_LOGR_IF_ERROR(eError, "CacheOpTimelineBind");

	OSLockAcquire(gsCwq.hDeferredLock);

	/*
	   Check if there is any deferred queueing space available and that nothing is
	   currently queued. This second check is required as Android where timelines
	   are used sets a timeline signalling deadline of 1000ms to signal timelines
	   else complains. So seeing we cannot be sure how long the CacheOp presently
	   in the queue would take we should not send this timeline down the queue as
	   well.
	 */
	ui32NextIdx = CacheOpIdxNext(&gsCwq.hWriteCounter);
	if (!CacheOpIdxSpan(&gsCwq.hWriteCounter, &gsCwq.hReadCounter) &&
		CacheOpIdxRead(&gsCwq.hReadCounter) != ui32NextIdx)
	{
		psCacheOpWorkItem = &gsCwq.asWorkItems[ui32NextIdx];
		CacheOpQItemWriteCheck(psCacheOpWorkItem);

		psCacheOpWorkItem->sSWTimelineObj = sCacheOpWorkItem.sSWTimelineObj;
		psCacheOpWorkItem->iTimeline = sCacheOpWorkItem.iTimeline;
		psCacheOpWorkItem->ui32OpSeqNum = CacheOpGetNextCommonSeqNum();
		psCacheOpWorkItem->uiCacheOp = PVRSRV_CACHE_OP_TIMELINE;
		psCacheOpWorkItem->uiOffset = (IMG_DEVMEM_OFFSET_T)0;
		psCacheOpWorkItem->uiSize = (IMG_DEVMEM_SIZE_T)0;
		psCacheOpWorkItem->ui32GFSeqNum = 0;
		psCacheOpWorkItem->psDevNode = psDevNode;
		/* Defer timeline using information page PMR */
		psCacheOpWorkItem->psPMR = gsCwq.psInfoPagePMR;
		eError = PMRLockSysPhysAddresses(psCacheOpWorkItem->psPMR);
		PVR_LOGG_IF_ERROR(eError, "PMRLockSysPhysAddresses", e0);
#if defined(CACHEOP_DEBUG)
		psCacheOpWorkItem->pid = OSGetCurrentClientProcessIDKM();
		psCacheOpWorkItem->ui64EnqueuedTime = OSClockns64();
		gsCwq.ui32ServerASync += 1;
		gsCwq.ui32ServerDTL += 1;
#endif

		/* Mark index ready for cache maintenance */
		(void) CacheOpIdxIncrement(&gsCwq.hWriteCounter);

		OSLockRelease(gsCwq.hDeferredLock);

		/* Signal the CacheOp thread to ensure this GF get processed */
		eError = OSEventObjectSignal(gsCwq.hThreadWakeUpEvtObj);
		PVR_LOG_IF_ERROR(eError, "OSEventObjectSignal");
	}
	else
	{
		IMG_BOOL bExecTimeline = IMG_TRUE;
		IMG_UINT32 ui32CompletedOpSeqNum = OSAtomicRead(&gsCwq.hCompletedSeqNum);

		OSLockRelease(gsCwq.hDeferredLock);

		/*
		   This pathway requires careful handling here as the client CacheOp(s) predicated on this
		   timeline might have been broken-up (i.e. batched) into several server requests by client:
		   1 - In the first case, a CacheOp from an earlier batch is still in-flight, so we check if
		   this is the case because even though we might have executed all the CacheOps in this batch
		   synchronously, we cannot be sure that any in-flight CacheOp pending on this client is not
		   predicated on this timeline hence we need to synchronise here for safety by fencing until
		   all in-flight CacheOps are completed. NOTE: On Android, this might cause issues due to
		   timelines notification deadlines so we do not fence (i.e. cannot sleep or wait) here to
		   synchronise, instead nudge services client to retry the request if there is no GF support.
		   2 - In the second case, there is no in-flight CacheOp for this client in which case just
		   continue processing as normal.
		 */
		if (!bUsedGlobalFlush && !CacheOpFenceCheck(ui32CompletedOpSeqNum, ui32CurrentFenceSeqNum))
		{
#if defined(ANDROID)
			/* Cannot fence (sleep/wait) here due to Android timeline deadlines;
			   fall back to a GF and ask the client to retry if unavailable */
			if (CacheOpGlobalFlush() != PVRSRV_OK)
			{
				bExecTimeline = IMG_FALSE;
				eError = PVRSRV_ERROR_RETRY;
			}
#else
			eError = CacheOpFence ((RGXFWIF_DM)0, ui32CurrentFenceSeqNum);
			PVR_LOG_IF_ERROR(eError, "CacheOpFence");

			/* CacheOpFence() might have triggered a GF so we take advantage of it */
			if (gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] > ui32CurrentFenceSeqNum)
			{
				*pui32NextFenceSeqNum = 0;
			}
#endif
		}

		if (bExecTimeline)
		{
			/* CacheOp fence requirement met, signal timeline */
			eError = CacheOpTimelineExec(&sCacheOpWorkItem);
			PVR_LOG_IF_ERROR(eError, "CacheOpTimelineExec");
		}
	}

	return eError;
e0:
	if (psCacheOpWorkItem)
	{
		/* Need to ensure we leave this CacheOp QItem in the proper recycled state */
		CacheOpQItemRecycle(psCacheOpWorkItem);
		OSLockRelease(gsCwq.hDeferredLock);
	}

	return eError;
}

/* Services a client batch of CacheOps using range-based maintenance.
   Each operation is either deferred into the CacheOp work queue (when the
   configuration and queue space permit) or executed synchronously in the
   caller context. On return *pui32NextFenceSeqNum is the seqNum of the last
   deferred CacheOp in the batch, or zero when nothing was deferred.
   @psDevNode               Device node the batch targets
   @ppsPMR/pvAddress/puiOffset/puiSize/puiCacheOp  Per-op arrays, ui32NumCacheOps long
   @uiTimeline              Optional timeline to signal after the batch
   @ui32GlobalFlushSeqNum   GF seqNum the client last observed
   @uiCurrentFenceSeqNum    Client's current fence sequence number
   @pui32NextFenceSeqNum    Out: next fence sequence number for the client */
static PVRSRV_ERROR CacheOpBatchExecRangeBased(PVRSRV_DEVICE_NODE *psDevNode,
											PMR **ppsPMR,
											IMG_CPU_VIRTADDR *pvAddress,
											IMG_DEVMEM_OFFSET_T *puiOffset,
											IMG_DEVMEM_SIZE_T *puiSize,
											PVRSRV_CACHE_OP *puiCacheOp,
											IMG_UINT32 ui32NumCacheOps,
											PVRSRV_TIMELINE uiTimeline,
											IMG_UINT32 ui32GlobalFlushSeqNum,
											IMG_UINT32 uiCurrentFenceSeqNum,
											IMG_UINT32 *pui32NextFenceSeqNum)
{
	IMG_UINT32 ui32Idx;
	IMG_UINT32 ui32NextIdx;
	IMG_BOOL bBatchHasTimeline;
	IMG_BOOL bCacheOpConfigKDF;
	IMG_BOOL bCacheOpConfigKRBF;
	IMG_DEVMEM_SIZE_T uiLogicalSize;
	PVRSRV_ERROR eError = PVRSRV_OK;
	IMG_BOOL bUseGlobalFlush = IMG_FALSE;
	CACHEOP_WORK_ITEM *psCacheOpWorkItem = NULL;
#if defined(CACHEOP_DEBUG)
	CACHEOP_WORK_ITEM sCacheOpWorkItem = {0};
	IMG_UINT32 ui32OpSeqNum = CacheOpGetNextCommonSeqNum();
	sCacheOpWorkItem.pid = OSGetCurrentClientProcessIDKM();
#endif

	/* Check if batch has an associated timeline update */
	bBatchHasTimeline = puiCacheOp[ui32NumCacheOps-1] & PVRSRV_CACHE_OP_TIMELINE;
	puiCacheOp[ui32NumCacheOps-1] &= ~(PVRSRV_CACHE_OP_GLOBAL | PVRSRV_CACHE_OP_TIMELINE);

	/* Check if config. supports kernel deferring of cacheops */
	bCacheOpConfigKDF = CacheOpConfigSupports(CACHEOP_CONFIG_KDF);
	bCacheOpConfigKRBF = CacheOpConfigSupports(CACHEOP_CONFIG_KRBF);

	/*
	   Client expects the next fence seqNum to be zero unless the server has deferred
	   at least one CacheOp in the submitted queue in which case the server informs
	   the client of the last CacheOp seqNum deferred in this batch.
	*/
	for (*pui32NextFenceSeqNum = 0, ui32Idx = 0; ui32Idx < ui32NumCacheOps; ui32Idx++)
	{
		if (! puiSize[ui32Idx])
		{
			/* Fail UM request, don't silently ignore */
			eError = PVRSRV_ERROR_INVALID_PARAMS;
			goto e0;
		}
		else if (bCacheOpConfigKDF)
		{
			/* Check if there is deferred queueing space available */
			ui32NextIdx = CacheOpIdxNext(&gsCwq.hWriteCounter);
			if (ui32NextIdx != CacheOpIdxRead(&gsCwq.hReadCounter))
			{
				psCacheOpWorkItem = &gsCwq.asWorkItems[ui32NextIdx];
			}
		}

		/*
		   Normally, we would like to defer client CacheOp(s) but we may not always be in a
		   position or is necessary to do so based on the following reasons:
		   0 - There is currently no queueing space left to enqueue this CacheOp, this might
		       imply the system is queueing more requests than can be consumed by the CacheOp
		       thread in time.
		   1 - Batch has timeline, action this now due to Android timeline signaling deadlines.
		   2 - Configuration does not support deferring of cache maintenance operations so we
		       execute the batch synchronously/immediately.
		   3 - CacheOp has an INVALIDATE, as this is used to transfer device memory buffer
		       ownership back to the processor, we cannot defer it so action it immediately.
		   4 - CacheOp size too small (single OS page size) to warrant overhead of deferment,
		       this will not be considered if KRBF is not present, as it implies defer all.
		   5 - CacheOp size OK for deferment, but a client virtual address is supplied so we
		       might as well just take advantage of said VA & flush immediately in UM context.
		   6 - Prevent DoS attack if a malicious client queues something very large, say 1GiB
		       and the processor cache ISA does not have a global flush implementation. Here
			   we upper bound this threshold to PVR_DIRTY_BYTES_FLUSH_THRESHOLD.
		   7 - Ensure QoS (load balancing) by not over-loading queue with too many requests,
		       here the (pseudo) alternate queue is the user context so we execute directly
		       on it if the processor cache ISA does not have a global flush implementation.
		*/
		if (!psCacheOpWorkItem  ||
			bBatchHasTimeline   ||
			!bCacheOpConfigKDF  ||
			puiCacheOp[ui32Idx] & PVRSRV_CACHE_OP_INVALIDATE ||
			(bCacheOpConfigKRBF && puiSize[ui32Idx] <= (IMG_DEVMEM_SIZE_T)gsCwq.uiPageSize) ||
			(pvAddress[ui32Idx] && puiSize[ui32Idx] < (IMG_DEVMEM_SIZE_T)gsCwq.pui32InfoPage[CACHEOP_INFO_KMDFTHRESHLD]) ||
			(gsCwq.bNoGlobalFlushImpl && puiSize[ui32Idx] >= (IMG_DEVMEM_SIZE_T)(gsCwq.pui32InfoPage[CACHEOP_INFO_KMDFTHRESHLD] << 2)) ||
			(gsCwq.bNoGlobalFlushImpl && OSAtomicRead(&gsCwq.hDeferredSize) >= gsCwq.pui32InfoPage[CACHEOP_INFO_KMDFTHRESHLD] << CACHEOP_INDICES_LOG2_SIZE))
		{
			/* When the CacheOp thread not keeping up, trash d-cache */
			bUseGlobalFlush = !psCacheOpWorkItem && bCacheOpConfigKDF ? IMG_TRUE : IMG_FALSE;
#if defined(CACHEOP_DEBUG)
			sCacheOpWorkItem.ui64EnqueuedTime = OSClockns64();
			gsCwq.ui32ServerSync += 1;
#endif
			psCacheOpWorkItem = NULL;

			/* Execute synchronously in this (user) context */
			eError = CacheOpPMRExec(ppsPMR[ui32Idx],
									pvAddress[ui32Idx],
									puiOffset[ui32Idx],
									puiSize[ui32Idx],
									puiCacheOp[ui32Idx],
									ui32GlobalFlushSeqNum,
									IMG_FALSE,
									&bUseGlobalFlush);
			PVR_LOGG_IF_ERROR(eError, "CacheOpExecPMR", e0);

#if defined(CACHEOP_DEBUG)
			sCacheOpWorkItem.ui64ExecuteTime = OSClockns64();
			sCacheOpWorkItem.bRBF = !bUseGlobalFlush;
			sCacheOpWorkItem.ui32OpSeqNum = bUseGlobalFlush ?
				gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] : ui32OpSeqNum;
			sCacheOpWorkItem.psPMR = ppsPMR[ui32Idx];
			sCacheOpWorkItem.uiSize = puiSize[ui32Idx];
			sCacheOpWorkItem.uiOffset = puiOffset[ui32Idx];
			sCacheOpWorkItem.uiCacheOp = puiCacheOp[ui32Idx];
			CacheOpStatsExecLogWrite(&sCacheOpWorkItem);
#endif

			/* A GF covers the rest of the batch; remaining ops become discards */
			if (bUseGlobalFlush) break;
			continue;
		}

		/* Need to validate request parameters here before enqueing */
		eError = PMR_LogicalSize(ppsPMR[ui32Idx], &uiLogicalSize);
		PVR_LOGG_IF_ERROR(eError, "PMR_LogicalSize", e0);
		eError = PVRSRV_ERROR_DEVICEMEM_OUT_OF_RANGE;
		PVR_LOGG_IF_FALSE(((puiOffset[ui32Idx]+puiSize[ui32Idx]) <= uiLogicalSize), CACHEOP_DEVMEM_OOR_ERROR_STRING, e0);
		eError = PVRSRV_OK;

		/* For safety, take reference here in user context */
		eError = PMRLockSysPhysAddresses(ppsPMR[ui32Idx]);
		PVR_LOGG_IF_ERROR(eError, "PMRLockSysPhysAddresses", e0);

		OSLockAcquire(gsCwq.hDeferredLock);

		/* Select next item off the queue to defer with */
		ui32NextIdx = CacheOpIdxNext(&gsCwq.hWriteCounter);
		if (ui32NextIdx != CacheOpIdxRead(&gsCwq.hReadCounter))
		{
			psCacheOpWorkItem = &gsCwq.asWorkItems[ui32NextIdx];
			CacheOpQItemWriteCheck(psCacheOpWorkItem);
		}
		else
		{
			/* Retry, disable KDF for this batch */
			OSLockRelease(gsCwq.hDeferredLock);
			bCacheOpConfigKDF = IMG_FALSE;
			psCacheOpWorkItem = NULL;
			ui32Idx = ui32Idx - 1;
			continue;
		}

		/* Timeline need to be looked-up (i.e. bind) in the user context
		   before deferring into the CacheOp thread kernel context.
		   NOTE(review): on failure this jumps to e1 after the PMR sys-phys
		   addresses were locked above but before psPMR is recorded in the
		   work item — verify the lock taken at PMRLockSysPhysAddresses()
		   is not leaked on this path */
		eError = CacheOpTimelineBind(psDevNode, psCacheOpWorkItem, PVRSRV_NO_TIMELINE);
		PVR_LOGG_IF_ERROR(eError, "CacheOpTimelineBind", e1);

		/* Prepare & enqueue next deferred work item for CacheOp thread */
		psCacheOpWorkItem->ui32OpSeqNum = CacheOpGetNextCommonSeqNum();
		*pui32NextFenceSeqNum = psCacheOpWorkItem->ui32OpSeqNum;
		psCacheOpWorkItem->ui32GFSeqNum = ui32GlobalFlushSeqNum;
		psCacheOpWorkItem->uiCacheOp = puiCacheOp[ui32Idx];
		psCacheOpWorkItem->uiOffset = puiOffset[ui32Idx];
		psCacheOpWorkItem->uiSize = puiSize[ui32Idx];
		psCacheOpWorkItem->psPMR = ppsPMR[ui32Idx];
		psCacheOpWorkItem->psDevNode = psDevNode;
#if defined(CACHEOP_DEBUG)
		psCacheOpWorkItem->ui64EnqueuedTime = OSClockns64();
		psCacheOpWorkItem->pid = sCacheOpWorkItem.pid;
		psCacheOpWorkItem->bDeferred = IMG_TRUE;
		psCacheOpWorkItem->bKMReq = IMG_FALSE;
		psCacheOpWorkItem->bUMF = IMG_FALSE;
		gsCwq.ui32ServerASync += 1;
#endif

		/* Increment deferred size & mark index ready for cache maintenance.
		   NOTE(review): the size is truncated to 32 bits here — presumably
		   individual CacheOp sizes are bounded well below 4GiB; confirm */
		OSAtomicAdd(&gsCwq.hDeferredSize, (IMG_UINT32)puiSize[ui32Idx]);
		(void) CacheOpIdxIncrement(&gsCwq.hWriteCounter);

		OSLockRelease(gsCwq.hDeferredLock);
		psCacheOpWorkItem = NULL;
	}

	/* Signal the CacheOp thread to ensure these items get processed */
	eError = OSEventObjectSignal(gsCwq.hThreadWakeUpEvtObj);
	PVR_LOG_IF_ERROR(eError, "OSEventObjectSignal");

	if (bUseGlobalFlush)
	{
#if defined(CACHEOP_DEBUG)
		/* GF was logged already in the loop above, so rest if any are discards */
		sCacheOpWorkItem.ui64ExecuteTime = sCacheOpWorkItem.ui64EnqueuedTime;
		sCacheOpWorkItem.pid = OSGetCurrentClientProcessIDKM();
		while (++ui32Idx < ui32NumCacheOps)
		{
			sCacheOpWorkItem.psPMR = ppsPMR[ui32Idx];
			sCacheOpWorkItem.uiSize = puiSize[ui32Idx];
			sCacheOpWorkItem.uiOffset = puiOffset[ui32Idx];
			sCacheOpWorkItem.uiCacheOp = puiCacheOp[ui32Idx];
			CacheOpStatsExecLogWrite(&sCacheOpWorkItem);
			gsCwq.ui32KMDiscards += 1;
		}
#endif

		/* No next UM fence seqNum */
		*pui32NextFenceSeqNum = 0;
	}

e1:
	if (psCacheOpWorkItem)
	{
		/* Need to ensure we leave this CacheOp QItem in the proper recycled state */
		CacheOpQItemRecycle(psCacheOpWorkItem);
		OSLockRelease(gsCwq.hDeferredLock);
	}
e0:
	if (bBatchHasTimeline)
	{
		PVRSRV_ERROR eError2;
		eError2 = CacheOpBatchExecTimeline(psDevNode, uiTimeline, bUseGlobalFlush,
										   uiCurrentFenceSeqNum, pui32NextFenceSeqNum);
		eError = (eError2 == PVRSRV_ERROR_RETRY) ? eError2 : eError;
	}

	return eError;
}

/* Services a client batch of CacheOps using a single global flush (GF):
   the GF is either predicated away (an interim GF already covered it),
   executed synchronously in the caller context, or deferred into the
   CacheOp work queue. Fails with PVRSRV_ERROR_NOT_SUPPORTED when the
   cache ISA has no global flush implementation.
   Parameters mirror CacheOpBatchExecRangeBased(); pvAddress is unused. */
static PVRSRV_ERROR CacheOpBatchExecGlobal(PVRSRV_DEVICE_NODE *psDevNode,
									PMR **ppsPMR,
									IMG_CPU_VIRTADDR *pvAddress,
									IMG_DEVMEM_OFFSET_T *puiOffset,
									IMG_DEVMEM_SIZE_T *puiSize,
									PVRSRV_CACHE_OP *puiCacheOp,
									IMG_UINT32 ui32NumCacheOps,
									PVRSRV_TIMELINE uiTimeline,
									IMG_UINT32 ui32GlobalFlushSeqNum,
									IMG_UINT32 uiCurrentFenceSeqNum,
									IMG_UINT32 *pui32NextFenceSeqNum)
{
	IMG_UINT32 ui32Idx;
	IMG_BOOL bBatchHasTimeline;
	PVRSRV_ERROR eError = PVRSRV_OK;
	IMG_BOOL bUseGlobalFlush = IMG_FALSE;
	CACHEOP_WORK_ITEM *psCacheOpWorkItem = NULL;
#if	defined(CACHEOP_DEBUG)
	IMG_DEVMEM_SIZE_T uiTotalSize = 0;
	CACHEOP_WORK_ITEM sCacheOpWorkItem = {0};
	sCacheOpWorkItem.pid = OSGetCurrentClientProcessIDKM();
#endif
#if !defined(CACHEFLUSH_ISA_SUPPORTS_GLOBAL_FLUSH)
	PVR_LOGR_IF_ERROR(PVRSRV_ERROR_NOT_SUPPORTED, CACHEOP_NO_GFLUSH_ERROR_STRING);
#endif
	PVR_UNREFERENCED_PARAMETER(pvAddress);

	/* Check if batch has an associated timeline update request */
	bBatchHasTimeline = puiCacheOp[ui32NumCacheOps-1] & PVRSRV_CACHE_OP_TIMELINE;
	puiCacheOp[ui32NumCacheOps-1] &= ~(PVRSRV_CACHE_OP_GLOBAL | PVRSRV_CACHE_OP_TIMELINE);

	/* Skip operation if an else-when GF has occurred in the interim time */
	if (gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] > ui32GlobalFlushSeqNum)
	{
#if	defined(CACHEOP_DEBUG)
		sCacheOpWorkItem.ui32OpSeqNum = ui32GlobalFlushSeqNum;
#endif
		bUseGlobalFlush = IMG_TRUE;
		*pui32NextFenceSeqNum = 0;
		goto exec_timeline;
	}

	/* Here we need to check that client batch does not contain an INVALIDATE CacheOp */
	for (*pui32NextFenceSeqNum = 0, ui32Idx = 0; ui32Idx < ui32NumCacheOps; ui32Idx++)
	{
#if	defined(CACHEOP_DEBUG)
		IMG_DEVMEM_SIZE_T uiLogicalSize;
		uiTotalSize += puiSize[ui32Idx];
		/* There is no need to validate request parameters as we are about
		   to issue a GF but this might lead to issues being reproducible
		   in one config but not the other, so valid under debug */
		eError = PMR_LogicalSize(ppsPMR[ui32Idx], &uiLogicalSize);
		PVR_LOGG_IF_ERROR(eError, "PMR_LogicalSize", e0);
		eError = PVRSRV_ERROR_DEVICEMEM_OUT_OF_RANGE;
		PVR_LOGG_IF_FALSE(((puiOffset[ui32Idx]+puiSize[ui32Idx]) <= uiLogicalSize), CACHEOP_DEVMEM_OOR_ERROR_STRING, e0);
		eError = PVRSRV_OK;
#endif
		if (! puiSize[ui32Idx])
		{
			/* Fail UM request, don't silently ignore */
			eError = PVRSRV_ERROR_INVALID_PARAMS;
			goto e0;
		}
		else if (puiCacheOp[ui32Idx] & PVRSRV_CACHE_OP_INVALIDATE)
		{
			/* Invalidates cannot be deferred */
			bUseGlobalFlush = IMG_TRUE;
		}
	}

	OSLockAcquire(gsCwq.hDeferredLock);

	/*
	   Normally, we would like to defer client CacheOp(s) but we may not always be in a
	   position to do so based on the following reasons:
	   0 - Batch has an INVALIDATE, as this is used to transfer device memory buffer
	       ownership back to the processor, we cannot defer it so action it immediately.
	   1 - Configuration does not support deferring of cache maintenance operations so
		   we execute synchronously/immediately.
	   2 - There is currently no queueing space left to enqueue this CacheOp, this might
	       imply the system is queueing more requests than can be consumed by the CacheOp
	       thread in time.
	   3 - Batch has a timeline and there is currently something queued, we cannot defer
	       because currently queued operation(s) might take quite a while to action which
	       might cause a timeline deadline timeout.
	*/
	if (bUseGlobalFlush ||
		!CacheOpConfigSupports(CACHEOP_CONFIG_KDF) ||
		CacheOpIdxNext(&gsCwq.hWriteCounter) == CacheOpIdxRead(&gsCwq.hReadCounter) ||
		(bBatchHasTimeline && CacheOpIdxSpan(&gsCwq.hWriteCounter, &gsCwq.hReadCounter)))

	{
		OSLockRelease(gsCwq.hDeferredLock);
#if	defined(CACHEOP_DEBUG)
		sCacheOpWorkItem.ui32OpSeqNum =	CacheOpGetNextCommonSeqNum();
		sCacheOpWorkItem.ui64EnqueuedTime = OSClockns64();
#endif
		/* Execute the GF synchronously in this (user) context */
		eError = CacheOpGlobalFlush();
		PVR_LOGG_IF_ERROR(eError, "CacheOpGlobalFlush", e0);
		bUseGlobalFlush = IMG_TRUE;
#if	defined(CACHEOP_DEBUG)
		sCacheOpWorkItem.ui64ExecuteTime = OSClockns64();
		gsCwq.ui32ServerSync += 1;
#endif
		goto exec_timeline;
	}

	/* Select next item off queue to defer this GF and possibly timeline with */
	psCacheOpWorkItem = &gsCwq.asWorkItems[CacheOpIdxNext(&gsCwq.hWriteCounter)];
	CacheOpQItemWriteCheck(psCacheOpWorkItem);

	/* Defer the GF using information page PMR */
	psCacheOpWorkItem->psPMR = gsCwq.psInfoPagePMR;
	eError = PMRLockSysPhysAddresses(psCacheOpWorkItem->psPMR);
	PVR_LOGG_IF_ERROR(eError, "PMRLockSysPhysAddresses", e0);

	/* Timeline object has to be looked-up here in user context.
	   NOTE(review): on failure this jumps to e0 after the info-page PMR
	   sys-phys addresses were locked above — verify that lock is not
	   leaked on this path (CacheOpQItemRecycle alone may not unlock it) */
	eError = CacheOpTimelineBind(psDevNode, psCacheOpWorkItem, uiTimeline);
	PVR_LOGG_IF_ERROR(eError, "CacheOpTimelineBind", e0);

	/* Prepare & enqueue next deferred work item for CacheOp thread */
	*pui32NextFenceSeqNum = CacheOpGetNextCommonSeqNum();
	psCacheOpWorkItem->ui32OpSeqNum = *pui32NextFenceSeqNum;
	psCacheOpWorkItem->ui32GFSeqNum = ui32GlobalFlushSeqNum;
	psCacheOpWorkItem->uiCacheOp = PVRSRV_CACHE_OP_GLOBAL;
	psCacheOpWorkItem->uiOffset = (IMG_DEVMEM_OFFSET_T)0;
	psCacheOpWorkItem->uiSize = (IMG_DEVMEM_SIZE_T)0;
#if defined(CACHEOP_DEBUG)
	/* Note client pid & queueing time of deferred GF CacheOp */
	psCacheOpWorkItem->ui64EnqueuedTime = OSClockns64();
	psCacheOpWorkItem->pid = sCacheOpWorkItem.pid;
	OSAtomicAdd(&gsCwq.hDeferredSize, uiTotalSize);
	psCacheOpWorkItem->uiSize = uiTotalSize;
	psCacheOpWorkItem->bDeferred = IMG_TRUE;
	psCacheOpWorkItem->bKMReq = IMG_FALSE;
	psCacheOpWorkItem->bUMF = IMG_FALSE;
	/* Client CacheOp is logged using the deferred seqNum */
	sCacheOpWorkItem.ui32OpSeqNum =	*pui32NextFenceSeqNum;
	sCacheOpWorkItem.ui64EnqueuedTime = psCacheOpWorkItem->ui64EnqueuedTime;
	sCacheOpWorkItem.ui64ExecuteTime = psCacheOpWorkItem->ui64EnqueuedTime;
	/* Update the CacheOp statistics */
	gsCwq.ui32ServerASync += 1;
	gsCwq.ui32ServerDGF += 1;
#endif

	/* Mark index ready for cache maintenance */
	(void) CacheOpIdxIncrement(&gsCwq.hWriteCounter);

	OSLockRelease(gsCwq.hDeferredLock);

	/* Signal CacheOp thread to ensure this GF get processed */
	eError = OSEventObjectSignal(gsCwq.hThreadWakeUpEvtObj);
	PVR_LOG_IF_ERROR(eError, "OSEventObjectSignal");

exec_timeline:
	if (bUseGlobalFlush && bBatchHasTimeline)
	{
		eError = CacheOpBatchExecTimeline(psDevNode, uiTimeline, bUseGlobalFlush,
										  uiCurrentFenceSeqNum, pui32NextFenceSeqNum);
	}

#if	defined(CACHEOP_DEBUG)
	for (ui32Idx = 0; ui32Idx < ui32NumCacheOps; ui32Idx++)
	{
		sCacheOpWorkItem.psPMR = ppsPMR[ui32Idx];
		sCacheOpWorkItem.uiSize = puiSize[ui32Idx];
		sCacheOpWorkItem.uiOffset = puiOffset[ui32Idx];
		sCacheOpWorkItem.uiCacheOp = puiCacheOp[ui32Idx];
		if (bUseGlobalFlush)
		{
			if (sCacheOpWorkItem.ui64ExecuteTime && ui32Idx)
			{
				/* Only first item carries the real execution time, rest are discards */
				sCacheOpWorkItem.ui64EnqueuedTime = sCacheOpWorkItem.ui64ExecuteTime;
			}
			gsCwq.ui32KMDiscards += !sCacheOpWorkItem.ui64ExecuteTime ? 1 : ui32Idx ? 1 : 0;
		}
		CacheOpStatsExecLogWrite(&sCacheOpWorkItem);
	}
#endif

	return eError;
e0:
	if (psCacheOpWorkItem)
	{
		/* Need to ensure we leave this CacheOp QItem in the proper recycled state */
		CacheOpQItemRecycle(psCacheOpWorkItem);
		OSLockRelease(gsCwq.hDeferredLock);
	}

	if (bBatchHasTimeline)
	{
		PVRSRV_ERROR eError2;
		eError2 = CacheOpBatchExecTimeline(psDevNode, uiTimeline, IMG_FALSE,
										   uiCurrentFenceSeqNum, pui32NextFenceSeqNum);
		eError = (eError2 == PVRSRV_ERROR_RETRY) ? eError2 : eError;
	}

	return eError;
}

/*
 * CacheOpExec - Execute a synchronous CPU d-cache maintenance operation on
 * the region bounded by pvVirtStart/pvVirtEnd (CPU virtual) and
 * sCPUPhysStart/sCPUPhysEnd (CPU physical) for the given uiCacheOp
 * (clean, invalidate or flush).
 *
 * If the framework is initialised and the region size exceeds the KM global
 * flush threshold, a single global cache flush (GF) is attempted first as it
 * is cheaper than per-line maintenance over a large range. Otherwise, or if
 * the GF path is unavailable/fails, the ranged OS cache primitives service
 * the request directly.
 *
 * Returns PVRSRV_OK; an unrecognised uiCacheOp is logged and ignored.
 */
PVRSRV_ERROR CacheOpExec (PPVRSRV_DEVICE_NODE psDevNode,
						  void *pvVirtStart,
						  void *pvVirtEnd,
						  IMG_CPU_PHYADDR sCPUPhysStart,
						  IMG_CPU_PHYADDR sCPUPhysEnd,
						  PVRSRV_CACHE_OP uiCacheOp)
{
	PVRSRV_ERROR eError = PVRSRV_ERROR_RETRY;
#if	defined(CACHEOP_DEBUG)
	IMG_BOOL bUsedGlobalFlush = IMG_FALSE;
	CACHEOP_WORK_ITEM sCacheOpWorkItem = {0};
	sCacheOpWorkItem.ui64EnqueuedTime = OSClockns64();
#endif

	if (gsCwq.bInit)
	{
		IMG_DEVMEM_SIZE_T uiSize = sCPUPhysEnd.uiAddr - sCPUPhysStart.uiAddr;
		/* Compare at full 64-bit width: the previous (IMG_UINT32) cast
		   truncated sizes >= 4GiB and could wrongly skip the GF path */
		if (uiSize > (IMG_DEVMEM_SIZE_T)gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD])
		{
			eError = CacheOpGlobalFlush();
		}
	}

	if (eError == PVRSRV_OK)
	{
		/* Entire cache flushed, the ranged request is subsumed by the GF */
#if	defined(CACHEOP_DEBUG)
		bUsedGlobalFlush = IMG_TRUE;
#endif
	}
	else
	{
		/* Fall back to (or use by default) range-based maintenance */
		switch (uiCacheOp)
		{
			case PVRSRV_CACHE_OP_CLEAN:
				OSCPUCacheCleanRangeKM(psDevNode, pvVirtStart, pvVirtEnd, sCPUPhysStart, sCPUPhysEnd);
				break;
			case PVRSRV_CACHE_OP_INVALIDATE:
				OSCPUCacheInvalidateRangeKM(psDevNode, pvVirtStart, pvVirtEnd, sCPUPhysStart, sCPUPhysEnd);
				break;
			case PVRSRV_CACHE_OP_FLUSH:
				OSCPUCacheFlushRangeKM(psDevNode, pvVirtStart, pvVirtEnd, sCPUPhysStart, sCPUPhysEnd);
				break;
			default:
				PVR_DPF((PVR_DBG_ERROR,	"%s: Invalid cache operation type %d",
						 __func__, uiCacheOp));
				break;
		}
		eError = PVRSRV_OK;
	}

#if	defined(CACHEOP_DEBUG)
	if (! CacheOpConfigSupports(CACHEOP_CONFIG_KLOG))
	{
		if (bUsedGlobalFlush)
		{
			/* Undo the accounting for server GF done in CacheOpGlobalFlush() */
			gsCwq.ui32ServerGF -= 1;
		}
	}
	else
	{
		/* Update statistics and append this request to the debugfs history */
		gsCwq.ui32TotalExecOps += 1;
		if (! bUsedGlobalFlush)
		{
			gsCwq.ui32ServerSync += 1;
			/* Count of whole cache-lines maintained by this ranged request */
			gsCwq.ui32ServerRBF +=
				((sCPUPhysEnd.uiAddr - sCPUPhysStart.uiAddr) & ((IMG_DEVMEM_SIZE_T)~(gsCwq.uiLineSize - 1))) >> gsCwq.uiLineShift;
		}
		sCacheOpWorkItem.uiOffset = 0;
		sCacheOpWorkItem.bKMReq = IMG_TRUE;
		sCacheOpWorkItem.uiCacheOp = uiCacheOp;
		sCacheOpWorkItem.bRBF = !bUsedGlobalFlush;
		/* Use information page PMR for logging KM request */
		sCacheOpWorkItem.psPMR = gsCwq.psInfoPagePMR;
		sCacheOpWorkItem.ui64ExecuteTime = OSClockns64();
		sCacheOpWorkItem.pid = OSGetCurrentClientProcessIDKM();
		sCacheOpWorkItem.ui32OpSeqNum = CacheOpGetNextCommonSeqNum();
		sCacheOpWorkItem.uiSize = (sCPUPhysEnd.uiAddr - sCPUPhysStart.uiAddr);
		CacheOpStatsExecLogWrite(&sCacheOpWorkItem);
	}
#endif

	return eError;
}

/*
 * CacheOpValExec - Execute a synchronous CPU d-cache maintenance operation
 * on a PMR over the span [uiOffset, uiOffset + uiSize). uiAddress, when
 * non-zero, carries a pre-established CPU virtual mapping of the PMR which
 * CacheOpPMRExec may use to avoid (re)mapping it.
 *
 * bUseGlobalFlush starts as a size-based hint (request larger than the KM
 * GF threshold) and is updated by CacheOpPMRExec to reflect whether a
 * global flush was actually used.
 *
 * Returns the CacheOpPMRExec result; failures are logged.
 */
PVRSRV_ERROR CacheOpValExec(PMR *psPMR,
						    IMG_UINT64 uiAddress,
						    IMG_DEVMEM_OFFSET_T uiOffset,
						    IMG_DEVMEM_SIZE_T uiSize,
						    PVRSRV_CACHE_OP uiCacheOp)
{
	PVRSRV_ERROR eError;
	IMG_CPU_VIRTADDR pvAddress = (IMG_CPU_VIRTADDR)(uintptr_t)uiAddress;
	/* Compare at full 64-bit width: the previous (IMG_UINT32) cast truncated
	   sizes >= 4GiB and could wrongly bias against a global flush */
	IMG_BOOL bUseGlobalFlush = uiSize > (IMG_DEVMEM_SIZE_T)gsCwq.pui32InfoPage[CACHEOP_INFO_KMGFTHRESHLD];
#if	defined(CACHEOP_DEBUG)
	CACHEOP_WORK_ITEM sCacheOpWorkItem = {0};
	gsCwq.ui32TotalExecOps += 1;
	gsCwq.ui32ServerSync += 1;
	sCacheOpWorkItem.psPMR = psPMR;
	sCacheOpWorkItem.uiSize = uiSize;
	sCacheOpWorkItem.uiOffset = uiOffset;
	sCacheOpWorkItem.uiCacheOp = uiCacheOp;
	sCacheOpWorkItem.pid = OSGetCurrentClientProcessIDKM();
	sCacheOpWorkItem.ui32OpSeqNum = CacheOpGetNextCommonSeqNum();
	sCacheOpWorkItem.ui64EnqueuedTime = OSClockns64();
#endif

	eError = CacheOpPMRExec(psPMR,
							pvAddress,
							uiOffset,
							uiSize,
							uiCacheOp,
							gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0],
							IMG_FALSE,
							&bUseGlobalFlush);
	PVR_LOGG_IF_ERROR(eError, "CacheOpPMRExec", e0);

#if	defined(CACHEOP_DEBUG)
	/* Log the completed request; bRBF records whether range-based
	   maintenance (rather than a GF) serviced it */
	sCacheOpWorkItem.bRBF = !bUseGlobalFlush;
	sCacheOpWorkItem.ui64ExecuteTime = OSClockns64();
	CacheOpStatsExecLogWrite(&sCacheOpWorkItem);
#endif

e0:
	return eError;
}

/*
 * CacheOpQueue - Bridge entry point for a batch of client CacheOp requests.
 * Each of the ui32NumCacheOps entries is described by the parallel arrays
 * ppsPMR/puiAddress/puiOffset/puiSize/puiCacheOp. The batch is routed to
 * one of three handlers: immediate timeline processing (single timeline-only
 * entry), global-flush batch execution (explicit client GF request, or a
 * GF-only configuration), or the default range-based batch execution.
 * pui32NextFenceSeqNum receives the sequence number clients fence on.
 */
PVRSRV_ERROR CacheOpQueue (CONNECTION_DATA *psConnection,
						   PVRSRV_DEVICE_NODE *psDevNode,
						   IMG_UINT32 ui32NumCacheOps,
						   PMR **ppsPMR,
						   IMG_UINT64 *puiAddress,
						   IMG_DEVMEM_OFFSET_T *puiOffset,
						   IMG_DEVMEM_SIZE_T *puiSize,
						   PVRSRV_CACHE_OP *puiCacheOp,
						   IMG_UINT32 ui32OpTimeline,
						   IMG_UINT32 ui32ClientGFSeqNum,
						   IMG_UINT32 uiCurrentFenceSeqNum,
						   IMG_UINT32 *pui32NextFenceSeqNum)
{
	PVRSRV_TIMELINE iTimeline = (PVRSRV_TIMELINE)ui32OpTimeline;
	IMG_CPU_VIRTADDR *ppvVirtAddr = (IMG_CPU_VIRTADDR*)(uintptr_t)puiAddress;

	PVR_UNREFERENCED_PARAMETER(psConnection);

#if !defined(CACHEFLUSH_ISA_SUPPORTS_GLOBAL_FLUSH)
	/* A client GF sequence number is meaningless without GF ISA support */
	PVR_LOGR_IF_FALSE((ui32ClientGFSeqNum == 0),
					  "CacheOpQueue(ui32ClientGFSeqNum > 0)",
	                  PVRSRV_ERROR_INVALID_PARAMS);
#endif
#if defined(CACHEOP_DEBUG)
	gsCwq.ui32TotalExecOps += ui32NumCacheOps;
#endif

	/* Guard clauses: framework must be up and the batch non-empty */
	if (! gsCwq.bInit)
	{
		PVR_LOG(("CacheOp framework not initialised, failing request"));
		return PVRSRV_ERROR_NOT_INITIALISED;
	}

	if (! ui32NumCacheOps)
	{
		return PVRSRV_ERROR_INVALID_PARAMS;
	}

	/* A lone timeline CacheOp request is processed immediately */
	if (ui32NumCacheOps == 1 && puiCacheOp[0] == PVRSRV_CACHE_OP_TIMELINE)
	{
		return CacheOpBatchExecTimeline(psDevNode, iTimeline, IMG_TRUE,
										uiCurrentFenceSeqNum, pui32NextFenceSeqNum);
	}

	/* An explicit client GF request, or a GF-only configuration (i.e. no
	   KRBF support), takes priority over range-based processing */
	if (CacheOpConfigSupports(CACHEOP_CONFIG_KGF) &&
		((puiCacheOp[ui32NumCacheOps-1] & PVRSRV_CACHE_OP_GLOBAL) || !CacheOpConfigSupports(CACHEOP_CONFIG_KRBF)))
	{
		return CacheOpBatchExecGlobal(psDevNode,
									  ppsPMR,
									  ppvVirtAddr,
									  puiOffset,
									  puiSize,
									  puiCacheOp,
									  ui32NumCacheOps,
									  iTimeline,
									  ui32ClientGFSeqNum,
									  uiCurrentFenceSeqNum,
									  pui32NextFenceSeqNum);
	}

	/* Default route for all other client requests */
	if (!(gsCwq.eConfig & (CACHEOP_CONFIG_LAST-1)))
	{
		/* No configuration selected yet; fall back to the default one */
		CacheOpConfigUpdate(CACHEOP_CONFIG_DEFAULT);
	}

	return CacheOpBatchExecRangeBased(psDevNode,
									  ppsPMR,
									  ppvVirtAddr,
									  puiOffset,
									  puiSize,
									  puiCacheOp,
									  ui32NumCacheOps,
									  iTimeline,
									  ui32ClientGFSeqNum,
									  uiCurrentFenceSeqNum,
									  pui32NextFenceSeqNum);
}

/*
 * CacheOpFence - Block the calling client thread until the CacheOp with
 * sequence number ui32FenceOpSeqNum (and all preceding ones) has been
 * executed, or the fence is satisfied by a more recent global flush (GF).
 *
 * Fast paths: the last GF sequence number already exceeds the fence, or the
 * completed-sequence counter already satisfies it. Otherwise wait-and-retry
 * on the client wake-up event; on each timeout either a newer GF satisfies
 * the fence, a GF is issued directly (when KGF is supported), or after
 * gsCwq.ui32FenceRetryAbort iterations the caller is asked to retry with
 * PVRSRV_ERROR_RETRY.
 *
 * Fixes applied: OSEventObjectClose failures were mislogged as
 * "OSEventObjectOpen"; debug fence timing used IMG_UINT32 for 64-bit
 * nanosecond timestamps, truncating them before the delta was taken.
 */
PVRSRV_ERROR CacheOpFence (RGXFWIF_DM eFenceOpType, IMG_UINT32 ui32FenceOpSeqNum)
{
	IMG_HANDLE hOSEvent;
	PVRSRV_ERROR eError2;
	IMG_UINT32 ui32RetryAbort;
	IMG_UINT32 ui32CompletedOpSeqNum;
	PVRSRV_ERROR eError = PVRSRV_OK;
#if defined(CACHEOP_DEBUG)
	IMG_UINT64 uiTimeNow;
	CACHEOP_WORK_ITEM sCacheOpWorkItem = {0};
	sCacheOpWorkItem.pid = OSGetCurrentClientProcessIDKM();
	sCacheOpWorkItem.ui32OpSeqNum = ui32FenceOpSeqNum;
	sCacheOpWorkItem.ui64EnqueuedTime = OSClockns64();
	uiTimeNow = sCacheOpWorkItem.ui64EnqueuedTime;
#if defined(PVRSRV_ENABLE_GPU_MEMORY_INFO) && defined(DEBUG)
	sCacheOpWorkItem.eFenceOpType = eFenceOpType;
#endif
	/* For fence logging, uiSize/uiOffset are repurposed to carry the last
	   completed seqNum and the last GF seqNum respectively */
	sCacheOpWorkItem.uiSize = (uintptr_t) OSAtomicRead(&gsCwq.hCompletedSeqNum);
	sCacheOpWorkItem.uiOffset = (uintptr_t) gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0];
#endif
	PVR_UNREFERENCED_PARAMETER(eFenceOpType);

	/* CacheOp(s) this thread is fencing for has already been satisfied by an
	   else-when GF. Another way of looking at this, if last else-when GF is
	   logically behind or momentarily disabled (zero) then we have to flush
	   the cache */
	if (gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] > ui32FenceOpSeqNum)
	{
#if defined(CACHEOP_DEBUG)
		sCacheOpWorkItem.uiOffset = (uintptr_t) gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0];
#endif
		goto e0;
	}

	/* If initial fence check fails, then wait-and-retry in loop */
	ui32CompletedOpSeqNum = OSAtomicRead(&gsCwq.hCompletedSeqNum);
	if (CacheOpFenceCheck(ui32CompletedOpSeqNum, ui32FenceOpSeqNum))
	{
#if defined(CACHEOP_DEBUG)
		sCacheOpWorkItem.uiSize = (uintptr_t) ui32CompletedOpSeqNum;
#endif
		goto e0;
	}

	/* Open CacheOp update event object, if event open fails return error */
	eError2 = OSEventObjectOpen(gsCwq.hClientWakeUpEvtObj, &hOSEvent);
	PVR_LOGG_IF_ERROR(eError2, "OSEventObjectOpen", e0);

	/* Linear (i.e. use exponential?) back-off, upper bounds user wait */
	for (ui32RetryAbort = gsCwq.ui32FenceRetryAbort; ;--ui32RetryAbort)
	{
		/* (Re)read completed CacheOp sequence number before waiting */
		ui32CompletedOpSeqNum = OSAtomicRead(&gsCwq.hCompletedSeqNum);
		if (CacheOpFenceCheck(ui32CompletedOpSeqNum, ui32FenceOpSeqNum))
		{
#if defined(CACHEOP_DEBUG)
			sCacheOpWorkItem.uiSize = (uintptr_t) ui32CompletedOpSeqNum;
#endif
			break;
		}

		/*
		   For cache ISA with GF support, the wait(ms) must be set to be around
		   25% GF overhead and as such there is no point waiting longer, we just
		   perform a GF as it means the CacheOp thread is really lagging behind.
		   Lastly, we cannot (or should not) hang the client thread indefinitely
		   so after a certain duration, we just give up. What this duration is,
		   is hard to state but for now we set it to be 1 seconds, which is the
		   product of CACHEOP_FENCE_[WAIT_TIMEOUT * RETRY_ABORT]. We ask the
		   client to retry the operation by exiting with PVRSRV_ERROR_RETRY.
		*/
		(void) OSEventObjectWaitTimeout(hOSEvent, gsCwq.ui32FenceWaitTimeUs);
		if (gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] > ui32FenceOpSeqNum)
		{
			/* A GF has overtaken the fence while we were waiting */
#if defined(CACHEOP_DEBUG)
			sCacheOpWorkItem.uiOffset = (uintptr_t) gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0];
			uiTimeNow = OSClockns64();
#endif
			break;
		}
		else if (CacheOpConfigSupports(CACHEOP_CONFIG_KGF))
		{
			/* Lagging worker thread; satisfy the fence ourselves with a GF */
			eError2 = CacheOpGlobalFlush();
			PVR_LOG_IF_ERROR(eError2, "CacheOpGlobalFlush");
#if defined(CACHEOP_DEBUG)
			sCacheOpWorkItem.uiCacheOp = PVRSRV_CACHE_OP_GLOBAL;
			sCacheOpWorkItem.uiOffset = (uintptr_t) gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0];
			uiTimeNow = OSClockns64();
#endif
			break;
		}
		else if (! ui32RetryAbort)
		{
			/* Upper wait bound reached; ask the client to retry */
#if defined(CACHEOP_DEBUG)
			sCacheOpWorkItem.uiSize = (uintptr_t) OSAtomicRead(&gsCwq.hCompletedSeqNum);
			sCacheOpWorkItem.uiOffset = (uintptr_t) gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0];
			uiTimeNow = OSClockns64();
#endif
			PVR_LOG(("CacheOpFence() event: "CACHEOP_ABORT_FENCE_ERROR_STRING));
			eError = PVRSRV_ERROR_RETRY;
			break;
		}
		else
		{
#if defined(CACHEOP_DEBUG)
			uiTimeNow = OSClockns64();
#endif
		}
	}

	eError2 = OSEventObjectClose(hOSEvent);
	PVR_LOG_IF_ERROR(eError2, "OSEventObjectClose");

e0:
#if defined(CACHEOP_DEBUG)
	sCacheOpWorkItem.ui64ExecuteTime = uiTimeNow;
	if (ui32FenceOpSeqNum)
	{
		/* Only fence(s) pending on CacheOp(s) contribute towards statistics,
		   here we calculate the rolling approximate average waiting time
		   for these fence(s). Timestamps are 64-bit nanosecond values; the
		   delta is taken at full width before scaling down to microseconds */
		IMG_UINT64 ui64EnqueuedTime = sCacheOpWorkItem.ui64EnqueuedTime;
		IMG_UINT64 ui64ExecuteTime = sCacheOpWorkItem.ui64ExecuteTime;
		IMG_UINT64 ui64Delta = ui64EnqueuedTime < ui64ExecuteTime ?
									ui64ExecuteTime - ui64EnqueuedTime :
									ui64EnqueuedTime - ui64ExecuteTime;
		IMG_UINT32 ui32Time = (IMG_UINT32) DivBy10(DivBy10(DivBy10(ui64Delta)));
		gsCwq.ui32TotalFenceOps += 1;
		if (gsCwq.ui32TotalFenceOps > 2)
		{
			gsCwq.ui32AvgFenceTime -= (gsCwq.ui32AvgFenceTime / gsCwq.ui32TotalFenceOps);
			gsCwq.ui32AvgFenceTime += (ui32Time / gsCwq.ui32TotalFenceOps);
		}
		else if (ui32Time)
		{
			gsCwq.ui32AvgFenceTime = (IMG_UINT32)ui32Time;
		}
	}
	CacheOpStatsExecLogWrite(&sCacheOpWorkItem);
#endif

	return eError;
}

/*
 * CacheOpLog - Record a CacheOp that was executed entirely in user-mode (UM)
 * into the kernel-side debug statistics and debugfs history. Compiles down
 * to parameter references only when CACHEOP_DEBUG is not defined.
 * Always returns PVRSRV_OK.
 */
PVRSRV_ERROR CacheOpLog (PMR *psPMR,
						 IMG_UINT64 puiAddress,
						 IMG_DEVMEM_OFFSET_T uiOffset,
						 IMG_DEVMEM_SIZE_T uiSize,
						 IMG_UINT64 ui64EnqueuedTimeUs,
						 IMG_UINT64 ui64ExecuteTimeUs,
						 IMG_UINT32 ui32NumRBF,
						 IMG_BOOL bIsDiscard,
						 PVRSRV_CACHE_OP uiCacheOp)
{
#if defined(CACHEOP_DEBUG)
	CACHEOP_WORK_ITEM sLogItem = {0};
	PVR_UNREFERENCED_PARAMETER(puiAddress);

	/* Identify the request: who asked, which PMR span, which operation */
	sLogItem.pid = OSGetCurrentClientProcessIDKM();
	sLogItem.ui32OpSeqNum = CacheOpGetNextCommonSeqNum();
	sLogItem.psPMR = psPMR;
	sLogItem.uiOffset = uiOffset;
	sLogItem.uiSize = uiSize;
	sLogItem.uiCacheOp = uiCacheOp;

	/* Timing information as reported by the client */
	sLogItem.ui64EnqueuedTime = ui64EnqueuedTimeUs;
	sLogItem.ui64ExecuteTime = ui64ExecuteTimeUs;

	/* A UM flush: a discard means no range-based maintenance was performed,
	   otherwise credit the client with the reported number of ranged ops */
	sLogItem.bUMF = IMG_TRUE;
	if (bIsDiscard)
	{
		sLogItem.bRBF = IMG_FALSE;
		gsCwq.ui32UMDiscards += 1;
	}
	else
	{
		sLogItem.bRBF = IMG_TRUE;
		gsCwq.ui32ClientRBF += ui32NumRBF;
	}
	gsCwq.ui32ClientSync += 1;
	gsCwq.ui32TotalExecOps += 1;

	CacheOpStatsExecLogWrite(&sLogItem);
#else
	PVR_UNREFERENCED_PARAMETER(psPMR);
	PVR_UNREFERENCED_PARAMETER(uiSize);
	PVR_UNREFERENCED_PARAMETER(uiOffset);
	PVR_UNREFERENCED_PARAMETER(uiCacheOp);
	PVR_UNREFERENCED_PARAMETER(ui32NumRBF);
	PVR_UNREFERENCED_PARAMETER(puiAddress);
	PVR_UNREFERENCED_PARAMETER(ui64ExecuteTimeUs);
	PVR_UNREFERENCED_PARAMETER(ui64EnqueuedTimeUs);
#endif
	return PVRSRV_OK;
}

/*
 * CacheOpInit2 - Second-stage initialisation of the CacheOp framework:
 * creates the worker/client event objects and locks, probes the CPU cache
 * ISA for global flush (GF) and user-mode flush (UMF) capability, selects a
 * platform configuration, seeds the CacheOp information page and spawns the
 * deferred-CacheOp worker thread.
 *
 * On any failure the partially-created state is torn down via
 * CacheOpDeInit2() and the error is returned.
 */
PVRSRV_ERROR CacheOpInit2 (void)
{
	PVRSRV_ERROR eError;
	PVRSRV_DATA *psPVRSRVData = PVRSRVGetPVRSRVData();

	/* Create an event object for pending CacheOp work items */
	eError = OSEventObjectCreate("PVRSRV_CACHEOP_EVENTOBJECT", &gsCwq.hThreadWakeUpEvtObj);
	PVR_LOGG_IF_ERROR(eError, "OSEventObjectCreate", e0);

	/* Create an event object for updating pending fence checks on CacheOp */
	eError = OSEventObjectCreate("PVRSRV_CACHEOP_EVENTOBJECT", &gsCwq.hClientWakeUpEvtObj);
	PVR_LOGG_IF_ERROR(eError, "OSEventObjectCreate", e0);

	/* Appending work-items is not concurrent, lock protects against this */
	eError = OSLockCreate((POS_LOCK*)&gsCwq.hDeferredLock);
	PVR_LOGG_IF_ERROR(eError, "OSLockCreate", e0);

	/* Apphint read/write is not concurrent, so lock protects against this */
	eError = OSLockCreate((POS_LOCK*)&gsCwq.hConfigLock);
	PVR_LOGG_IF_ERROR(eError, "OSLockCreate", e0);

	/* Determine CPU cache ISA maintenance mechanism available, GF and UMF */
#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
	/* Arm provides no privileged whole-cache flush instruction */
	gsCwq.bNoGlobalFlushImpl = IMG_TRUE;
#else
	gsCwq.bNoGlobalFlushImpl = (OSCPUOperation(PVRSRV_CACHE_OP_FLUSH) != PVRSRV_OK) ? IMG_TRUE : IMG_FALSE;
#endif
	if (! gsCwq.bNoGlobalFlushImpl)
	{
		IMG_UINT64 uiIdx;
		IMG_UINT64 uiTime = 0;
		IMG_UINT64 uiTimeAfter;
		IMG_UINT64 uiTimeBefore;

		for (uiIdx = 0; uiIdx < 4; uiIdx++)
		{
			/* Take average of four GF */
			uiTimeBefore = OSClockns64();
			(void) OSCPUOperation(PVRSRV_CACHE_OP_FLUSH);
			uiTimeAfter = OSClockns64();

			/* Scale nanoseconds down to microseconds before accumulating */
			uiTimeBefore = DivBy10(DivBy10(DivBy10(uiTimeBefore)));
			uiTimeAfter = DivBy10(DivBy10(DivBy10(uiTimeAfter)));
			uiTime += uiTimeBefore < uiTimeAfter ?
								uiTimeAfter  - uiTimeBefore :
								uiTimeBefore - uiTimeAfter;
		}

		/* Fence wait quantum is ~25% (avg/4) of one GF; with GF available a
		   fence can always be satisfied, so retries never abort */
		gsCwq.ui32FenceWaitTimeUs = (IMG_UINT32)(uiTime >> 2);
		gsCwq.ui32FenceRetryAbort = ~0;
	}
	else
	{
		gsCwq.ui32FenceWaitTimeUs = CACHEOP_FENCE_WAIT_TIMEOUT;
		gsCwq.ui32FenceRetryAbort = CACHEOP_FENCE_RETRY_ABORT;
	}
#if defined(CACHEFLUSH_ISA_SUPPORTS_UM_FLUSH)
	gsCwq.bSupportsUMFlush = IMG_TRUE;
#else
	gsCwq.bSupportsUMFlush = IMG_FALSE;
#endif

	gsCwq.pui32InfoPage = psPVRSRVData->pui32InfoPage;
	gsCwq.psInfoPagePMR = psPVRSRVData->psInfoPagePMR;

	/* Normally, platforms should use their default configurations, put exceptions here */
#if defined(__i386__) || defined(__x86_64__)
#if !defined(TC_MEMORY_CONFIG)
	CacheOpConfigUpdate(CACHEOP_CONFIG_URBF | CACHEOP_CONFIG_KGF | CACHEOP_CONFIG_KDF);
#else
	CacheOpConfigUpdate(CACHEOP_CONFIG_KGF | CACHEOP_CONFIG_KDF);
#endif
#else /* defined(__x86__) */
	CacheOpConfigUpdate(CACHEOP_CONFIG_DEFAULT);
#endif

	/* Initialise the remaining occupants of the CacheOp information page */
	gsCwq.pui32InfoPage[CACHEOP_INFO_PGSIZE]    = (IMG_UINT32)gsCwq.uiPageSize;
	gsCwq.pui32InfoPage[CACHEOP_INFO_LINESIZE]  = (IMG_UINT32)gsCwq.uiLineSize;
	gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM0] = (IMG_UINT32)0;
	gsCwq.pui32InfoPage[CACHEOP_INFO_GFSEQNUM1] = (IMG_UINT32)0;

	/* Set before spawning thread */
	gsCwq.bInit = IMG_TRUE;

	/* Create a thread which is used to execute the deferred CacheOp(s),
	   these are CacheOp(s) executed by the server on behalf of clients
	   asynchronously. All clients synchronise with the server before
	   submitting any HW operation (i.e. device kicks) to ensure that
	   client device work-load memory is coherent */
	eError = OSThreadCreatePriority(&gsCwq.hWorkerThread,
									"pvr_cacheop",
									CacheOpThread,
									CacheOpThreadDumpInfo,
									IMG_TRUE,
									psPVRSRVData,
									OS_THREAD_HIGHEST_PRIORITY);
	PVR_LOGG_IF_ERROR(eError, "OSThreadCreatePriority", e0);

	/* Writing the unsigned integer binary encoding of CACHEOP_CONFIG
	   into this file cycles through avail. configuration(s) */
	gsCwq.pvConfigTune = OSCreateStatisticEntry("cacheop_config",
											NULL,
											CacheOpConfigRead,
											NULL);
	if (! gsCwq.pvConfigTune)
	{
		/* Previously this path left eError at PVRSRV_OK, so the caller saw
		   success even though the framework had been deinitialised */
		PVR_LOG(("OSCreateStatisticEntry failed"));
		eError = PVRSRV_ERROR_OUT_OF_MEMORY;
		goto e0;
	}

	/* Register the CacheOp framework (re)configuration handlers */
	PVRSRVAppHintRegisterHandlersUINT32(APPHINT_ID_CacheOpConfig,
										CacheOpConfigQuery,
										CacheOpConfigSet,
										APPHINT_OF_DRIVER_NO_DEVICE,
										(void *) APPHINT_ID_CacheOpConfig);

	PVRSRVAppHintRegisterHandlersUINT32(APPHINT_ID_CacheOpGFThresholdSize,
										CacheOpConfigQuery,
										CacheOpConfigSet,
										APPHINT_OF_DRIVER_NO_DEVICE,
										(void *) APPHINT_ID_CacheOpGFThresholdSize);

	PVRSRVAppHintRegisterHandlersUINT32(APPHINT_ID_CacheOpUMKMThresholdSize,
										CacheOpConfigQuery,
										CacheOpConfigSet,
										APPHINT_OF_DRIVER_NO_DEVICE,
										(void *) APPHINT_ID_CacheOpUMKMThresholdSize);

	return PVRSRV_OK;
e0:
	CacheOpDeInit2();
	return eError;
}

/*
 * CacheOpDeInit2 - Tears down the state created by CacheOpInit2 (also used
 * as the error-path cleanup of a partial CacheOpInit2, so every handle is
 * NULL-checked). Teardown order matters: the framework is marked down and
 * both event objects are signalled first so any waiters (the worker thread
 * and fencing clients) wake and observe bInit == IMG_FALSE, then the worker
 * thread is destroyed, and only afterwards are the event objects and locks
 * destroyed.
 */
void CacheOpDeInit2 (void)
{
	PVRSRV_ERROR eError = PVRSRV_OK;

	/* Refuse new work before waking anything up */
	gsCwq.bInit = IMG_FALSE;

	/* Wake the worker thread so it can notice bInit and exit */
	if (gsCwq.hThreadWakeUpEvtObj)
	{
		eError = OSEventObjectSignal(gsCwq.hThreadWakeUpEvtObj);
		PVR_LOG_IF_ERROR(eError, "OSEventObjectSignal");
	}

	/* Wake any client threads blocked in CacheOpFence() */
	if (gsCwq.hClientWakeUpEvtObj)
	{
		eError = OSEventObjectSignal(gsCwq.hClientWakeUpEvtObj);
		PVR_LOG_IF_ERROR(eError, "OSEventObjectSignal");
	}

	/* Retry thread destruction for up to OS_THREAD_DESTROY_TIMEOUT_US; the
	   thread may still be draining its queue after the wake-up above */
	if (gsCwq.hWorkerThread)
	{
		LOOP_UNTIL_TIMEOUT(OS_THREAD_DESTROY_TIMEOUT_US)
		{
			eError = OSThreadDestroy(gsCwq.hWorkerThread);
			if (PVRSRV_OK == eError)
			{
				gsCwq.hWorkerThread = NULL;
				break;
			}
			OSWaitus(OS_THREAD_DESTROY_TIMEOUT_US/OS_THREAD_DESTROY_RETRY_COUNT);
		} END_LOOP_UNTIL_TIMEOUT();
		PVR_LOG_IF_ERROR(eError, "OSThreadDestroy");
		/* NOTE(review): redundant when destruction succeeded (already NULLed
		   above), but also clears the handle on timeout/failure */
		gsCwq.hWorkerThread = NULL;
	}

	/* Event objects can only be destroyed once no thread can signal them */
	if (gsCwq.hClientWakeUpEvtObj)
	{
		eError = OSEventObjectDestroy(gsCwq.hClientWakeUpEvtObj);
		PVR_LOG_IF_ERROR(eError, "OSEventObjectDestroy");
		gsCwq.hClientWakeUpEvtObj = NULL;
	}

	if (gsCwq.hThreadWakeUpEvtObj)
	{
		eError = OSEventObjectDestroy(gsCwq.hThreadWakeUpEvtObj);
		PVR_LOG_IF_ERROR(eError, "OSEventObjectDestroy");
		gsCwq.hThreadWakeUpEvtObj = NULL;
	}

	if (gsCwq.hConfigLock)
	{
		eError = OSLockDestroy(gsCwq.hConfigLock);
		PVR_LOG_IF_ERROR(eError, "OSLockDestroy");
		gsCwq.hConfigLock = NULL;
	}

	if (gsCwq.hDeferredLock)
	{
		eError = OSLockDestroy(gsCwq.hDeferredLock);
		PVR_LOG_IF_ERROR(eError, "OSLockDestroy");
		gsCwq.hDeferredLock = NULL;
	}

	/* Remove the "cacheop_config" debugfs/statistic entry */
	if (gsCwq.pvConfigTune)
	{
		OSRemoveStatisticEntry(&gsCwq.pvConfigTune);
	}

	gsCwq.pui32InfoPage = NULL;
	gsCwq.psInfoPagePMR = NULL;
}

/*
 * CacheOpInit - First-stage initialisation of the CacheOp framework: caches
 * CPU page/cache-line geometry, resets the framework counters and per-index
 * work items, and creates the global-flush lock (plus, in CACHEOP_DEBUG
 * builds, the statistics lock and the "cacheop_history" debugfs entry).
 *
 * Returns PVRSRV_OK on success, PVRSRV_ERROR_INIT_FAILURE when the CPU
 * geometry cannot be determined, or a creation error otherwise.
 */
PVRSRV_ERROR CacheOpInit (void)
{
	IMG_UINT32 idx;
	PVRSRV_ERROR eError = PVRSRV_OK;

	/* DDK initialisation is anticipated to be performed on the boot
	   processor (little core in big/little systems) though this may
	   not always be the case. If so, the value cached here is the
	   system wide safe (i.e. smallest) L1 d-cache line size value
	   on any/such platforms with mismatched d-cache line sizes */
	gsCwq.uiPageSize = OSGetPageSize();
	gsCwq.uiPageShift = OSGetPageShift();
	gsCwq.uiLineSize = OSCPUCacheAttributeSize(PVR_DCACHE_LINE_SIZE);
	gsCwq.uiLineShift = ExactLog2(gsCwq.uiLineSize);
	PVR_LOGR_IF_FALSE((gsCwq.uiLineSize && gsCwq.uiPageSize && gsCwq.uiPageShift), "", PVRSRV_ERROR_INIT_FAILURE);
	gsCwq.uiCacheOpAddrType = OSCPUCacheOpAddressType();

	/* More information regarding these atomic counters can be found
	   in the CACHEOP_WORK_QUEUE type definition at top of file */
	OSAtomicWrite(&gsCwq.hCompletedSeqNum, 0);
	OSAtomicWrite(&gsCwq.hCommonSeqNum, 0);
	OSAtomicWrite(&gsCwq.hDeferredSize, 0);
	OSAtomicWrite(&gsCwq.hWriteCounter, 0);
	OSAtomicWrite(&gsCwq.hReadCounter, 0);

	/* Seed every queue slot with recognisable "invalid/recycled" markers */
	for (idx = 0; idx < CACHEOP_INDICES_MAX; idx++)
	{
		gsCwq.asWorkItems[idx].iTimeline = PVRSRV_NO_TIMELINE;
		gsCwq.asWorkItems[idx].psPMR = (void *)(uintptr_t)~0;
		gsCwq.asWorkItems[idx].ui32OpSeqNum = (IMG_UINT32)~0;
		gsCwq.asWorkItems[idx].ui32GFSeqNum = (IMG_UINT32)~0;
	}

	/* Lock prevents multiple threads from issuing surplus to requirement GF */
	eError = OSLockCreate((POS_LOCK*)&gsCwq.hGlobalFlushLock);
	PVR_LOGG_IF_ERROR(eError, "OSLockCreate", e0);

#if defined(CACHEOP_DEBUG)
	/* debugfs file read-out is not concurrent, so lock protects against this */
	eError = OSLockCreate((POS_LOCK*)&gsCwq.hStatsExecLock);
	PVR_LOGG_IF_ERROR(eError, "OSLockCreate", e0);

	gsCwq.i32StatsExecWriteIdx = 0;
	OSCachedMemSet(gsCwq.asStatsExecuted, 0, sizeof(gsCwq.asStatsExecuted));

	/* File captures the most recent subset of CacheOp(s) executed */
	gsCwq.pvStatsEntry = OSCreateStatisticEntry("cacheop_history",
												NULL,
												CacheOpStatsExecLogRead,
												NULL);
	if (! gsCwq.pvStatsEntry)
	{
		/* Previous code re-tested the stale eError from the OSLockCreate
		   above, so a NULL statistic entry went undetected */
		PVR_LOG(("OSCreateStatisticEntry failed"));
		eError = PVRSRV_ERROR_OUT_OF_MEMORY;
	}
#endif

e0:
	return eError;
}

/*
 * CacheOpDeInit - Releases the state created by CacheOpInit: the debug
 * statistics lock and "cacheop_history" entry (CACHEOP_DEBUG builds only)
 * and the global-flush lock. All handles are NULL-checked so this is safe
 * to call after a partial CacheOpInit.
 */
void CacheOpDeInit (void)
{
#if defined(CACHEOP_DEBUG)
	/* NOTE(review): the lock is destroyed before the statistic entry that
	   its reader (CacheOpStatsExecLogRead) presumably serialises on —
	   assumed safe because debugfs readers are gone by this point; confirm */
	if (gsCwq.hStatsExecLock)
	{
		(void) OSLockDestroy(gsCwq.hStatsExecLock);
		gsCwq.hStatsExecLock = NULL;
	}

	if (gsCwq.pvStatsEntry)
	{
		OSRemoveStatisticEntry(&gsCwq.pvStatsEntry);
	}
#endif
	if (gsCwq.hGlobalFlushLock)
	{
		(void) OSLockDestroy(gsCwq.hGlobalFlushLock);
		gsCwq.hGlobalFlushLock = NULL;
	}
}
